|
37 | 37 | "- Signup for an OpenAI Developer Account and create an API Key. See [OpenAI's developer platform](https://platform.openai.com/overview).\n", |
38 | 38 | "- Install Python\n", |
39 | 39 | "- Install and configure a python virtual environment. We recommend [Pyenv](https://github.com/pyenv/pyenv)\n", |
40 | | - "- Install the requirements for this notebook using the following command:\n", |
41 | | - "\n", |
42 | | - "```\n", |
43 | | - "pip install -r requirements.txt\n", |
44 | | - "```" |
| 40 | + "- Install the requirements for this notebook using the following command:" |
| 41 | + ] |
| 42 | + }, |
| 43 | + { |
| 44 | + "cell_type": "code", |
| 45 | + "execution_count": 188, |
| 46 | + "metadata": {}, |
| 47 | + "outputs": [], |
| 48 | + "source": [ |
| 49 | + "%pip install -r requirements.txt" |
45 | 50 | ] |
46 | 51 | }, |
47 | 52 | { |
48 | 53 | "cell_type": "code", |
49 | | - "execution_count": null, |
| 54 | + "execution_count": 189, |
50 | 55 | "metadata": {}, |
51 | 56 | "outputs": [], |
52 | 57 | "source": [ |
|
66 | 71 | }, |
67 | 72 | { |
68 | 73 | "cell_type": "code", |
69 | | - "execution_count": null, |
| 74 | + "execution_count": 190, |
70 | 75 | "metadata": {}, |
71 | 76 | "outputs": [], |
72 | 77 | "source": [ |
|
89 | 94 | }, |
90 | 95 | { |
91 | 96 | "cell_type": "code", |
92 | | - "execution_count": null, |
| 97 | + "execution_count": 191, |
93 | 98 | "metadata": {}, |
94 | 99 | "outputs": [], |
95 | 100 | "source": [ |
|
112 | 117 | }, |
113 | 118 | { |
114 | 119 | "cell_type": "code", |
115 | | - "execution_count": null, |
| 120 | + "execution_count": 192, |
116 | 121 | "metadata": {}, |
117 | 122 | "outputs": [], |
118 | 123 | "source": [ |
|
137 | 142 | "# Assumes we're using the text-embedding-3-small model\n",
138 | 143 | "# See https://openai.com/pricing\n",
139 | 144 | "def get_embedding_cost(num_tokens):\n", |
140 | | - " return num_tokens/1000*0.0001\n", |
| 145 | + " return num_tokens/1000*0.00002\n", |
141 | 146 | "\n", |
142 | 147 | "# Helper function: calculate total cost of embedding all content in the dataframe\n", |
143 | 148 | "def get_total_embeddings_cost():\n", |
|
147 | 152 | " token_len = num_tokens_from_string(text)\n", |
148 | 153 | " total_tokens = total_tokens + token_len\n", |
149 | 154 | " total_cost = get_embedding_cost(total_tokens)\n", |
150 | | - " return total_cost\n", |
151 | | - "\n", |
152 | | - "# Helper function: get embeddings for a text\n", |
153 | | - "def get_embeddings(text):\n", |
154 | | - " response = openai.Embedding.create(\n", |
155 | | - " model=\"text-embedding-ada-002\",\n", |
156 | | - " input = text.replace(\"\\n\",\" \")\n", |
157 | | - " )\n", |
158 | | - " embedding = response['data'][0]['embedding']\n", |
159 | | - " return embedding" |
| 155 | + " return total_cost" |
160 | 156 | ] |
161 | 157 | }, |
162 | 158 | { |
163 | 159 | "cell_type": "code", |
164 | | - "execution_count": null, |
| 160 | + "execution_count": 193, |
165 | 161 | "metadata": {}, |
166 | 162 | "outputs": [], |
167 | 163 | "source": [ |
|
189 | 185 | }, |
190 | 186 | { |
191 | 187 | "cell_type": "code", |
192 | | - "execution_count": null, |
| 188 | + "execution_count": 194, |
193 | 189 | "metadata": {}, |
194 | 190 | "outputs": [], |
195 | 191 | "source": [ |
196 | 192 | "###############################################################################\n", |
197 | 193 | "# Create new list with small content chunks to not hit max token limits\n", |
198 | 194 | "# Note: the maximum number of tokens for a single request is 8191\n", |
199 | | - "# https://openai.com/docs/api-reference/requests\n", |
| 195 | + "# https://platform.openai.com/docs/guides/embeddings/embedding-models\n", |
200 | 196 | "###############################################################################\n", |
201 | 197 | "# list for chunked content and embeddings\n", |
202 | 198 | "new_list = []\n", |
|
241 | 237 | }, |
242 | 238 | { |
243 | 239 | "cell_type": "code", |
244 | | - "execution_count": null, |
| 240 | + "execution_count": 195, |
| 241 | + "metadata": {}, |
| 242 | + "outputs": [], |
| 243 | + "source": [ |
| 244 | + "openai_client = openai.OpenAI()\n", |
| 245 | + "\n", |
| 246 | + "# Helper function: get embeddings for a text\n", |
| 247 | + "def get_embeddings(text):\n", |
| 248 | + " response = openai_client.embeddings.create(\n", |
| 249 | + " model=\"text-embedding-3-small\",\n", |
| 250 | + " input = text.replace(\"\\n\",\" \")\n", |
| 251 | + " )\n", |
| 252 | + " return response.data[0].embedding" |
| 253 | + ] |
| 254 | + }, |
| 255 | + { |
| 256 | + "cell_type": "code", |
| 257 | + "execution_count": 196, |
245 | 258 | "metadata": {}, |
246 | 259 | "outputs": [], |
247 | 260 | "source": [ |
|
258 | 271 | }, |
259 | 272 | { |
260 | 273 | "cell_type": "code", |
261 | | - "execution_count": null, |
| 274 | + "execution_count": 197, |
262 | 275 | "metadata": {}, |
263 | 276 | "outputs": [], |
264 | 277 | "source": [ |
265 | 278 | "# Save the dataframe with embeddings as a CSV file\n", |
266 | 279 | "df_new.to_csv('blog_data_and_embeddings.csv', index=False)\n", |
267 | 280 | "# It may also be useful to save as a json file, but we won't use this in the tutorial\n", |
268 | | - "#df_new.to_json('blog_data_and_embeddings.json')" |
| 281 | + "#df_new.to_json('blog_data_and_embeddings.json') " |
269 | 282 | ] |
270 | 283 | }, |
271 | 284 | { |
|
291 | 304 | }, |
292 | 305 | { |
293 | 306 | "cell_type": "code", |
294 | | - "execution_count": null, |
| 307 | + "execution_count": 198, |
295 | 308 | "metadata": {}, |
296 | 309 | "outputs": [], |
297 | 310 | "source": [ |
|
304 | 317 | }, |
305 | 318 | { |
306 | 319 | "cell_type": "code", |
307 | | - "execution_count": null, |
| 320 | + "execution_count": 199, |
308 | 321 | "metadata": {}, |
309 | 322 | "outputs": [], |
310 | 323 | "source": [ |
|
313 | 326 | "cur = conn.cursor()\n", |
314 | 327 | "\n", |
315 | 328 | "#install pgvector \n", |
316 | | - "cur.execute(\"CREATE EXTENSION IF NOT EXISTS vector\");\n", |
| 329 | + "cur.execute(\"CREATE EXTENSION IF NOT EXISTS vector;\")\n", |
| 330 | + "conn.commit()\n", |
| 331 | + "\n", |
| 332 | + "#install pgvectorscale \n", |
| 333 | + "cur.execute(\"CREATE EXTENSION IF NOT EXISTS vectorscale CASCADE;\")\n", |
317 | 334 | "conn.commit()\n", |
318 | 335 | "\n", |
319 | 336 | "# Register the vector type with psycopg2\n", |
|
346 | 363 | }, |
347 | 364 | { |
348 | 365 | "cell_type": "code", |
349 | | - "execution_count": null, |
| 366 | + "execution_count": 200, |
350 | 367 | "metadata": {}, |
351 | 368 | "outputs": [], |
352 | 369 | "source": [ |
|
381 | 398 | }, |
382 | 399 | { |
383 | 400 | "cell_type": "code", |
384 | | - "execution_count": null, |
| 401 | + "execution_count": 201, |
385 | 402 | "metadata": {}, |
386 | 403 | "outputs": [], |
387 | 404 | "source": [ |
|
391 | 408 | }, |
392 | 409 | { |
393 | 410 | "cell_type": "code", |
394 | | - "execution_count": null, |
| 411 | + "execution_count": 202, |
395 | 412 | "metadata": {}, |
396 | 413 | "outputs": [], |
397 | 414 | "source": [ |
|
409 | 426 | }, |
410 | 427 | { |
411 | 428 | "cell_type": "code", |
412 | | - "execution_count": null, |
| 429 | + "execution_count": 203, |
413 | 430 | "metadata": {}, |
414 | 431 | "outputs": [], |
415 | 432 | "source": [ |
|
433 | 450 | }, |
434 | 451 | { |
435 | 452 | "cell_type": "code", |
436 | | - "execution_count": null, |
| 453 | + "execution_count": 204, |
437 | 454 | "metadata": {}, |
438 | 455 | "outputs": [], |
439 | 456 | "source": [ |
|
445 | 462 | }, |
446 | 463 | { |
447 | 464 | "cell_type": "code", |
448 | | - "execution_count": null, |
| 465 | + "execution_count": 205, |
449 | 466 | "metadata": {}, |
450 | 467 | "outputs": [], |
451 | 468 | "source": [ |
|
465 | 482 | }, |
466 | 483 | { |
467 | 484 | "cell_type": "code", |
468 | | - "execution_count": null, |
| 485 | + "execution_count": 206, |
469 | 486 | "metadata": {}, |
470 | 487 | "outputs": [], |
471 | 488 | "source": [ |
472 | 489 | "# Create an index on the data for faster retrieval\n", |
473 | 490 | "# this isn't really needed for 129 vectors, but it shows the usage for larger datasets\n", |
474 | 491 | "# Note: always create this type of index after you have data already inserted into the DB\n", |
475 | 492 | "\n", |
476 | | - "#calculate the index parameters according to best practices\n", |
477 | | - "num_lists = num_records / 1000\n", |
478 | | - "if num_lists < 10:\n", |
479 | | - " num_lists = 10\n", |
480 | | - "if num_records > 1000000:\n", |
481 | | - " num_lists = math.sqrt(num_records)\n", |
482 | | - "\n", |
483 | | - "#use the cosine distance measure, which is what we'll later use for querying\n", |
484 | | - "cur.execute(f'CREATE INDEX ON embeddings USING ivfflat (embedding vector_cosine_ops) WITH (lists = {num_lists});')\n", |
485 | | - "conn.commit() " |
| 493 | + "# for different tuning suggestions check this: https://github.com/timescale/pgvectorscale?tab=readme-ov-file#tuning\n", |
| 494 | + "cur.execute('CREATE INDEX embedding_idx ON embeddings USING diskann (embedding);')\n", |
| 495 | + "conn.commit()" |
486 | 496 | ] |
487 | 497 | }, |
488 | 498 | { |
|
499 | 509 | }, |
500 | 510 | { |
501 | 511 | "cell_type": "code", |
502 | | - "execution_count": null, |
| 512 | + "execution_count": 207, |
503 | 513 | "metadata": {}, |
504 | 514 | "outputs": [], |
505 | 515 | "source": [ |
506 | 516 | "# Helper function: get text completion from OpenAI API\n", |
507 | 517 | "# Note we cap the response length via the max_tokens argument\n",
508 | 518 | "# Note we're using the gpt-4o model\n",
509 | | - "def get_completion_from_messages(messages, model=\"gpt-3.5-turbo-0613\", temperature=0, max_tokens=1000):\n", |
510 | | - " response = openai.ChatCompletion.create(\n", |
| 519 | + "def get_completion_from_messages(messages, model=\"gpt-4o\", temperature=0, max_tokens=1000):\n", |
| 520 | + " response = openai_client.chat.completions.create(\n", |
511 | 521 | " model=model,\n", |
512 | 522 | " messages=messages,\n", |
513 | 523 | " temperature=temperature, \n", |
514 | 524 | " max_tokens=max_tokens, \n", |
515 | 525 | " )\n", |
516 | | - " return response.choices[0].message[\"content\"]" |
| 526 | + " return response.choices[0].message.content" |
517 | 527 | ] |
518 | 528 | }, |
519 | 529 | { |
520 | 530 | "cell_type": "code", |
521 | | - "execution_count": null, |
| 531 | + "execution_count": 208, |
522 | 532 | "metadata": {}, |
523 | 533 | "outputs": [], |
524 | 534 | "source": [ |
|
547 | 557 | }, |
548 | 558 | { |
549 | 559 | "cell_type": "code", |
550 | | - "execution_count": null, |
| 560 | + "execution_count": 209, |
551 | 561 | "metadata": {}, |
552 | 562 | "outputs": [], |
553 | 563 | "source": [ |
|
557 | 567 | }, |
558 | 568 | { |
559 | 569 | "cell_type": "code", |
560 | | - "execution_count": null, |
| 570 | + "execution_count": 210, |
561 | 571 | "metadata": {}, |
562 | 572 | "outputs": [], |
563 | 573 | "source": [ |
|
590 | 600 | }, |
591 | 601 | { |
592 | 602 | "cell_type": "code", |
593 | | - "execution_count": null, |
| 603 | + "execution_count": 211, |
594 | 604 | "metadata": {}, |
595 | 605 | "outputs": [], |
596 | 606 | "source": [ |
|
601 | 611 | }, |
602 | 612 | { |
603 | 613 | "cell_type": "code", |
604 | | - "execution_count": null, |
| 614 | + "execution_count": 212, |
605 | 615 | "metadata": {}, |
606 | 616 | "outputs": [], |
607 | 617 | "source": [ |
|
629 | 639 | "name": "python", |
630 | 640 | "nbconvert_exporter": "python", |
631 | 641 | "pygments_lexer": "ipython3", |
632 | | - "version": "3.8.16" |
| 642 | + "version": "3.9.6" |
633 | 643 | } |
634 | 644 | }, |
635 | 645 | "nbformat": 4, |
|
0 commit comments