|
31 | 31 | "metadata": {}, |
32 | 32 | "source": [ |
33 | 33 | "# Documentation\n", |
34 | | - "Please read this document before getting started. \n", |
| 34 | + "Please read this document before getting started.\n", |
35 | 35 | "https://docs.google.com/document/d/1C_zZFGNjXq10P1MvEX6MM0TC7HHrkFOp9BB0P_S_2MQ" |
36 | 36 | ], |
37 | 37 | "cell_type": "markdown" |
|
46 | 46 | { |
47 | 47 | "metadata": {}, |
48 | 48 | "source": [ |
49 | | - "# labelbox\n", |
50 | 49 | "!pip3 install -q \"labelbox[data]\"" |
51 | 50 | ], |
52 | 51 | "cell_type": "code", |
|
56 | 55 | { |
57 | 56 | "metadata": {}, |
58 | 57 | "source": [ |
59 | | - "import labelbox as lb \n", |
| 58 | + "import labelbox as lb\n", |
60 | 59 | "import numpy as np\n", |
61 | 60 | "import json" |
62 | 61 | ], |
|
74 | 73 | { |
75 | 74 | "metadata": {}, |
76 | 75 | "source": [ |
77 | | - "# for custom embeddings\n", |
78 | 76 | "!pip3 install -q 'git+https://github.com/Labelbox/advlib.git'" |
79 | 77 | ], |
80 | 78 | "cell_type": "code", |
|
115 | 113 | "source": [ |
116 | 114 | "# get images from a Labelbox dataset\n", |
117 | 115 | "# Our systems start to process data after 1000 embeddings of each type; for this demo, make sure your dataset has over 1000 data rows\n", |
118 | | - "dataset = client.get_dataset(\"<ADD YOUR DATASET ID>\")\n", |
| 116 | + "dataset = client.get_dataset(\"<ADD YOUR DATASET ID>\") \n", |
119 | 117 | "drs = list(dataset.export_data_rows(timeout_seconds=9999))\n", |
120 | | - "data_row_ids = [dr.uid for dr in drs]" |
| 118 | + "data_row_ids = [dr.uid for dr in drs]\n", |
| 119 | + "data_row_ids = data_row_ids[:1000] # keep the first 1000 examples for the sake of this demo" |
121 | 120 | ], |
122 | 121 | "cell_type": "code", |
123 | 122 | "outputs": [], |
|
127 | 126 | "metadata": {}, |
128 | 127 | "source": [ |
129 | 128 | "# Create the payload for custom embeddings\n", |
130 | | - "It should be a .ndjson file\n", |
131 | | - "It does not have to be created through python." |
| 129 | + "It should be a .ndjson file. \n", |
| 130 | + "Every line is a JSON object terminated by a \\n character. \n", |
| 131 | + "It does not have to be created through python. " |
132 | 132 | ], |
133 | 133 | "cell_type": "markdown" |
134 | 134 | }, |
135 | 135 | { |
136 | 136 | "metadata": {}, |
137 | 137 | "source": [ |
138 | | - "\n", |
139 | | - "\n", |
140 | 138 | "nb_data_rows = len(data_row_ids)\n", |
141 | | - "# generate 1000 custom embedding vectors, of dimension 2048 each\n", |
142 | | - "# Labelbox supports custom embeddings of dimension up to 2048\n", |
| 139 | + "print(\"Number of data rows: \", nb_data_rows)\n", |
| 140 | + "# generate random vectors, of dimension 2048 each\n", |
| 141 | + "# Labelbox supports custom embedding vectors of dimension up to 2048\n", |
143 | 142 | "custom_embeddings = [list(np.random.random(2048)) for _ in range(nb_data_rows)]" |
144 | 143 | ], |
145 | 144 | "cell_type": "code", |
|
149 | 148 | { |
150 | 149 | "metadata": {}, |
151 | 150 | "source": [ |
152 | | - "# create the ndjson payload for custom embeddings\n", |
| 151 | + "# create the payload for custom embeddings\n", |
153 | 152 | "payload = []\n", |
154 | 153 | "for data_row_id,custom_embedding in zip(data_row_ids,custom_embeddings):\n", |
155 | 154 | " payload.append({\"id\": data_row_id, \"vector\": custom_embedding})\n", |
|
163 | 162 | { |
164 | 163 | "metadata": {}, |
165 | 164 | "source": [ |
166 | | - "# convert payload to ndjson file\n", |
| 165 | + "# delete any pre-existing file\n", |
| 166 | + "import os\n", |
| 167 | + "if os.path.exists(\"payload.ndjson\"):\n", |
| 168 | + " os.remove(\"payload.ndjson\")\n", |
167 | 169 | "\n", |
| 170 | + "# write the payload as an ndjson file (one JSON object per line)\n", |
168 | 171 | "with open('payload.ndjson', 'w') as f:\n", |
169 | | - " sanity_check_payload = json.dump(payload, f)\n", |
170 | | - "\n", |
171 | | - "\n", |
| 172 | + " for p in payload:\n", |
| 173 | + " f.write(json.dumps(p) + \"\\n\")\n", |
| 174 | + " # sanity_check_payload = json.dump(payload, f)" |
| 175 | + ], |
| 176 | + "cell_type": "code", |
| 177 | + "outputs": [], |
| 178 | + "execution_count": null |
| 179 | + }, |
| 180 | + { |
| 181 | + "metadata": {}, |
| 182 | + "source": [ |
172 | 183 | "# sanity check that you can read/load the file and the payload is correct\n", |
173 | 184 | "with open('payload.ndjson') as f:\n", |
174 | | - " sanity_check_payload = json.load(f)\n", |
175 | | - " \n", |
176 | | - "\n", |
177 | | - "print(\"Nb of custom embedding vectors in sanity_check_payload: \", len(sanity_check_payload))\n", |
178 | | - "# print(\"sanity_check_payload: \", sanity_check_payload)" |
| 185 | + " sanity_check_payload = [json.loads(l) for l in f.readlines()]\n", |
| 186 | + "print(\"Nb of custom embedding vectors in sanity_check_payload: \", len(sanity_check_payload))" |
179 | 187 | ], |
180 | 188 | "cell_type": "code", |
181 | 189 | "outputs": [], |
|
184 | 192 | { |
185 | 193 | "metadata": {}, |
186 | 194 | "source": [ |
187 | | - "# See all custom embeddings available\n", |
| 195 | + "# See all custom embeddings available in your Labelbox workspace\n", |
188 | 196 | "!advtool embeddings list" |
189 | 197 | ], |
190 | 198 | "cell_type": "code", |
|
194 | 202 | { |
195 | 203 | "metadata": {}, |
196 | 204 | "source": [ |
197 | | - "# # Create a new custom embedding\n", |
| 205 | + "# Create a new custom embedding, unless you want to re-use an existing one\n", |
198 | 206 | "!advtool embeddings create my_custom_embedding_2048_dimensions 2048\n", |
199 | | - "# will return the ID of the newly created embedding, e.g. cgbjjt5ra07710005liytdf19" |
| 207 | + "# this command will return the ID of the newly created embedding, e.g. ciqtgd94607290000ljx4dvh2" |
200 | 208 | ], |
201 | 209 | "cell_type": "code", |
202 | 210 | "outputs": [], |
|
206 | 214 | "metadata": {}, |
207 | 215 | "source": [ |
208 | 216 | "# # Delete a custom embedding\n", |
209 | | - "# !advtool embeddings delete cj7j0ukre0771000blj4qnxgn" |
| 217 | + "# !advtool embeddings delete ciqtgd94607290000ljx4dvh2" |
210 | 218 | ], |
211 | 219 | "cell_type": "code", |
212 | 220 | "outputs": [], |
|
222 | 230 | { |
223 | 231 | "metadata": {}, |
224 | 232 | "source": [ |
225 | | - "# Upload the payload to Labelbox \n", |
226 | | - "!advtool embeddings import cj7j0ukre0771000blj4qnxgn ./payload.ndjson" |
| 233 | + "# Replace the current id with the newly generated id from the previous step, or any existing custom embedding id.\n", |
| 234 | + "!advtool embeddings import c933bviqn0756000elk07et77 ./payload.ndjson" |
227 | 235 | ], |
228 | 236 | "cell_type": "code", |
229 | 237 | "outputs": [], |
|
240 | 248 | "metadata": {}, |
241 | 249 | "source": [ |
242 | 250 | "# count how many data rows have a specific custom embedding (This can take a couple of minutes)\n", |
243 | | - "!advtool embeddings count cj7j0ukre0771000blj4qnxgn" |
| 251 | + "!advtool embeddings count c933bviqn0756000elk07et77" |
244 | 252 | ], |
245 | 253 | "cell_type": "code", |
246 | 254 | "outputs": [], |
|
0 commit comments