Skip to content

Commit dbb369f

Browse files
committed
add multi-modal image search
1 parent 3dc5256 commit dbb369f

File tree

15 files changed

+3168
-0
lines changed

15 files changed

+3168
-0
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# 현재 작업 중 입니다.

genai/aws-gen-ai-kr/20_applications/06_multi_modal_image_search/data/berkely/berkely_image_embedding.json

Lines changed: 1 addition & 0 deletions
Large diffs are not rendered by default.
Lines changed: 342 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,342 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "markdown",
5+
"id": "4ab94990-19bf-4d3d-abfe-f445f3b26905",
6+
"metadata": {},
7+
"source": [
8+
"# 필수 패키지 설정, OpenSearch 클러스터 생성, Nori 플러그인 설치 (약 50분 소요)\n",
9+
"> 이 노트북은 SageMaker Studio **`Data Science 3.0`** kernel 및 ml.t3.medium 인스턴스에서 테스트 되었습니다."
10+
]
11+
},
12+
{
13+
"cell_type": "markdown",
14+
"id": "0ae1ea41-92c7-4dc2-8dfa-36585c36d1a5",
15+
"metadata": {},
16+
"source": [
17+
"## 0. 필수 사항\n",
18+
"- 실습을 위해서 노트북을 실행하는 역할(Role) 에 아래 권한이 추가 되어 있어야 합니다.\n",
19+
" - AmazonOpenSearchServiceFullAccess\n",
20+
" - AmazonSSMFullAccess"
21+
]
22+
},
23+
{
24+
"cell_type": "markdown",
25+
"id": "2b5787ac-54cc-4a5e-8f99-86f7a5e3a066",
26+
"metadata": {},
27+
"source": [
28+
"## OpenSearch Client, 인증정보 SSM에 저장, 한국어 분석을 위한 노리(Nori) 플러그인 설치 스크립트"
29+
]
30+
},
31+
{
32+
"cell_type": "code",
33+
"execution_count": null,
34+
"id": "b7ccaebc-1f3b-4da4-92bd-565bc0d4f7b8",
35+
"metadata": {
36+
"tags": []
37+
},
38+
"outputs": [],
39+
"source": [
40+
"%load_ext autoreload\n",
41+
"%autoreload 2\n",
42+
"\n",
43+
"import sys, os\n",
44+
"\n",
45+
"def add_python_path(module_path):\n",
46+
" if os.path.abspath(module_path) not in sys.path:\n",
47+
" sys.path.append(os.path.abspath(module_path))\n",
48+
" print(f\"python path: {os.path.abspath(module_path)} is added\")\n",
49+
" else:\n",
50+
" print(f\"python path: {os.path.abspath(module_path)} already exists\")\n",
51+
" print(\"sys.path: \", sys.path)\n",
52+
"\n",
53+
"module_path = \"..\"\n",
54+
"add_python_path(module_path)\n"
55+
]
56+
},
57+
{
58+
"cell_type": "code",
59+
"execution_count": null,
60+
"id": "689e69ed-57fc-44ae-a65d-001930e6e2bb",
61+
"metadata": {
62+
"tags": []
63+
},
64+
"outputs": [],
65+
"source": [
66+
"import boto3\n",
67+
"import uuid\n",
68+
"import botocore\n",
69+
"import time"
70+
]
71+
},
72+
{
73+
"cell_type": "code",
74+
"execution_count": null,
75+
"id": "7107e2ff-6369-4a13-abfb-b56c3955db9c",
76+
"metadata": {
77+
"tags": []
78+
},
79+
"outputs": [],
80+
"source": [
81+
"from search_utils.ssm import parameter_store"
82+
]
83+
},
84+
{
85+
"cell_type": "code",
86+
"execution_count": null,
87+
"id": "69f8f100-d4a2-4bb0-9b27-24bd0009a2f1",
88+
"metadata": {
89+
"tags": []
90+
},
91+
"outputs": [],
92+
"source": [
93+
"DEV = True # True일 경우 1-AZ without standby로 생성, False일 경우 3-AZ with standby. 워크샵 목적일 때는 지나친 과금/리소스 방지를 위해 True로 설정하는 것을 권장\n",
94+
"VERSION = \"2.11\" # OpenSearch Version (예: 2.7 / 2.9 / 2.11)"
95+
]
96+
},
97+
{
98+
"cell_type": "code",
99+
"execution_count": null,
100+
"id": "599f6eff-f129-4e3d-ba90-c2afc340f49d",
101+
"metadata": {
102+
"tags": []
103+
},
104+
"outputs": [],
105+
"source": [
106+
"opensearch_user_id = \"<your id>\" # ex) 'raguser'\n",
107+
"opensearch_user_password = \"<your password>\" # ex) 'MarsEarth1!'\n",
108+
"\n"
109+
]
110+
},
111+
{
112+
"cell_type": "code",
113+
"execution_count": null,
114+
"id": "84141435-57a9-4a60-82b1-c93d4e415e1e",
115+
"metadata": {
116+
"tags": []
117+
},
118+
"outputs": [],
119+
"source": [
120+
"# 0. Request creation of the OpenSearch domain (cluster, EBS and security options are assembled below)\n",
121+
"\n",
122+
"region = boto3.Session().region_name\n",
123+
"account_id = boto3.client(\"sts\").get_caller_identity()[\"Account\"]\n",
124+
"opensearch = boto3.client('opensearch', region)\n",
125+
"rand_str = uuid.uuid4().hex[:8]\n",
126+
"domain_name = f'image-search-{rand_str}'\n",
127+
"\n",
128+
"cluster_config_prod = {\n",
129+
" 'InstanceCount': 3,\n",
130+
" 'InstanceType': 'r6g.large.search',\n",
131+
" 'ZoneAwarenessEnabled': True,\n",
132+
" 'DedicatedMasterEnabled': True,\n",
133+
" 'MultiAZWithStandbyEnabled': True,\n",
134+
" 'DedicatedMasterType': 'r6g.large.search',\n",
135+
" 'DedicatedMasterCount': 3\n",
136+
"}\n",
137+
"\n",
138+
"cluster_config_dev = {\n",
139+
" 'InstanceCount': 1,\n",
140+
" 'InstanceType': 'r6g.large.search',\n",
141+
" 'ZoneAwarenessEnabled': False,\n",
142+
" 'DedicatedMasterEnabled': False,\n",
143+
"}\n",
144+
"\n",
145+
"\n",
146+
"ebs_options = {\n",
147+
" 'EBSEnabled': True,\n",
148+
" 'VolumeType': 'gp3',\n",
149+
" 'VolumeSize': 100,\n",
150+
"}\n",
151+
"\n",
152+
"advanced_security_options = {\n",
153+
" 'Enabled': True,\n",
154+
" 'InternalUserDatabaseEnabled': True,\n",
155+
" 'MasterUserOptions': {\n",
156+
" 'MasterUserName': opensearch_user_id,\n",
157+
" 'MasterUserPassword': opensearch_user_password\n",
158+
" }\n",
159+
"}\n",
160+
"\n",
161+
"ap = f'{{\\\"Version\\\":\\\"2012-10-17\\\",\\\"Statement\\\":[{{\\\"Effect\\\":\\\"Allow\\\",\\\"Principal\\\":{{\\\"AWS\\\":\\\"*\\\"}},\\\"Action\\\":\\\"es:*\\\",\\\"Resource\\\":\\\"arn:aws:es:{region}:{account_id}:domain\\/{domain_name}\\/*\\\"}}]}}'\n",
162+
"\n",
163+
"if DEV:\n",
164+
" cluster_config = cluster_config_dev\n",
165+
"else:\n",
166+
" cluster_config = cluster_config_prod\n",
167+
"\n",
168+
"response = opensearch.create_domain(\n",
169+
" DomainName=domain_name,\n",
170+
" EngineVersion=f'OpenSearch_{VERSION}',\n",
171+
" ClusterConfig=cluster_config,\n",
172+
" AccessPolicies=ap,\n",
173+
" EBSOptions=ebs_options,\n",
174+
" AdvancedSecurityOptions=advanced_security_options,\n",
175+
" NodeToNodeEncryptionOptions={'Enabled': True},\n",
176+
" EncryptionAtRestOptions={'Enabled': True},\n",
177+
" DomainEndpointOptions={'EnforceHTTPS': True}\n",
178+
")"
179+
]
180+
},
181+
{
182+
"cell_type": "code",
183+
"execution_count": null,
184+
"id": "d0f85455-908b-415c-92c7-c2e4cebde6fb",
185+
"metadata": {
186+
"tags": []
187+
},
188+
"outputs": [],
189+
"source": [
190+
"%%time\n",
191+
"\n",
192+
"# 1. OpenSearch 설치\n",
193+
"\n",
194+
"def wait_for_domain_creation(domain_name):\n",
195+
" try:\n",
196+
" response = opensearch.describe_domain(\n",
197+
" DomainName=domain_name\n",
198+
" )\n",
199+
" # Every 60 seconds, check whether the domain is processing.\n",
200+
" while 'Endpoint' not in response['DomainStatus']:\n",
201+
" print('Creating Opensearch domain...')\n",
202+
" time.sleep(60)\n",
203+
" response = opensearch.describe_domain(\n",
204+
" DomainName=domain_name)\n",
205+
"\n",
206+
" # Once we exit the loop, the domain is ready for ingestion.\n",
207+
" endpoint = response['DomainStatus']['Endpoint']\n",
208+
" print('Domain endpoint ready to receive data: ' + endpoint)\n",
209+
" except botocore.exceptions.ClientError as error:\n",
210+
" if error.response['Error']['Code'] == 'ResourceNotFoundException':\n",
211+
" print('Domain not found.')\n",
212+
" else:\n",
213+
" raise error\n",
214+
"\n",
215+
"wait_for_domain_creation(domain_name)\n",
216+
"\n",
217+
"response = opensearch.describe_domain(DomainName=domain_name)\n",
218+
"opensearch_domain_endpoint = f\"https://{response['DomainStatus']['Endpoint']}\"\n",
219+
"\n",
220+
"# 2. OpenSearch 인증정보 ssm에 저장하기\n",
221+
"\n",
222+
"region=boto3.Session().region_name\n",
223+
"pm = parameter_store(region)\n",
224+
"\n",
225+
"pm.put_params(\n",
226+
" key=\"opensearch_domain_endpoint\",\n",
227+
" value=f'{opensearch_domain_endpoint}',\n",
228+
" overwrite=True,\n",
229+
" enc=False\n",
230+
")\n",
231+
"\n",
232+
"pm.put_params(\n",
233+
" key=\"opensearch_user_id\",\n",
234+
" value=f'{opensearch_user_id}',\n",
235+
" overwrite=True,\n",
236+
" enc=False\n",
237+
")\n",
238+
"\n",
239+
"pm.put_params(\n",
240+
" key=\"opensearch_user_password\",\n",
241+
" value=f'{opensearch_user_password}',\n",
242+
" overwrite=True,\n",
243+
" enc=True\n",
244+
")\n",
245+
"\n",
246+
"# 3. 한국어 분석을 위한 노리(Nori) 플러그인 설치\n",
247+
"\n",
248+
"nori_pkg_id = {}\n",
249+
"nori_pkg_id['us-east-1'] = {\n",
250+
" '2.3': 'G196105221',\n",
251+
" '2.5': 'G240285063',\n",
252+
" '2.7': 'G16029449', \n",
253+
" '2.9': 'G60209291',\n",
254+
" '2.11': 'G181660338'\n",
255+
"}\n",
256+
"\n",
257+
"nori_pkg_id['us-west-2'] = {\n",
258+
" '2.3': 'G94047474',\n",
259+
" '2.5': 'G138227316',\n",
260+
" '2.7': 'G182407158', \n",
261+
" '2.9': 'G226587000',\n",
262+
" '2.11': 'G79602591'\n",
263+
"}\n",
264+
"\n",
265+
"pkg_response = opensearch.associate_package(\n",
266+
" PackageID=nori_pkg_id[region][VERSION], # nori plugin\n",
267+
" DomainName=domain_name\n",
268+
")\n",
269+
"\n",
270+
"def wait_for_associate_package(domain_name, max_results=1):\n",
271+
"\n",
272+
" response = opensearch.list_packages_for_domain(\n",
273+
" DomainName=domain_name,\n",
274+
" MaxResults=1\n",
275+
" )\n",
276+
" # Every 60 seconds, check whether the domain is processing.\n",
277+
" while response['DomainPackageDetailsList'][0]['DomainPackageStatus'] == \"ASSOCIATING\":\n",
278+
" print('Associating packages...')\n",
279+
" time.sleep(60)\n",
280+
" response = opensearch.list_packages_for_domain(\n",
281+
" DomainName=domain_name,\n",
282+
" MaxResults=1\n",
283+
" )\n",
284+
"\n",
285+
" print('Nori Plugin Associated!')\n",
286+
"\n",
287+
"wait_for_associate_package(domain_name)"
288+
]
289+
},
290+
{
291+
"cell_type": "code",
292+
"execution_count": null,
293+
"id": "f6beecf3-d556-4fcd-af9c-ec327e0a63e9",
294+
"metadata": {
295+
"tags": []
296+
},
297+
"outputs": [],
298+
"source": [
299+
"print (pm.get_params(key=\"opensearch_domain_endpoint\", enc=False))\n",
300+
"print (pm.get_params(key=\"opensearch_user_id\", enc=False))\n",
301+
"print (pm.get_params(key=\"opensearch_user_password\", enc=True))"
302+
]
303+
},
304+
{
305+
"cell_type": "code",
306+
"execution_count": null,
307+
"id": "fc7aba1f-2446-4a01-8546-01ba65f91c2c",
308+
"metadata": {},
309+
"outputs": [],
310+
"source": []
311+
},
312+
{
313+
"cell_type": "code",
314+
"execution_count": null,
315+
"id": "73f37ef4-2038-4cb2-b976-59d91a47937f",
316+
"metadata": {},
317+
"outputs": [],
318+
"source": []
319+
}
320+
],
321+
"metadata": {
322+
"kernelspec": {
323+
"display_name": "base",
324+
"language": "python",
325+
"name": "python3"
326+
},
327+
"language_info": {
328+
"codemirror_mode": {
329+
"name": "ipython",
330+
"version": 3
331+
},
332+
"file_extension": ".py",
333+
"mimetype": "text/x-python",
334+
"name": "python",
335+
"nbconvert_exporter": "python",
336+
"pygments_lexer": "ipython3",
337+
"version": "3.10.13"
338+
}
339+
},
340+
"nbformat": 4,
341+
"nbformat_minor": 5
342+
}

0 commit comments

Comments
 (0)