From 9b199b8aadc51a1631542c1c5d67d8aa09a31828 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Mon, 3 Nov 2025 10:44:56 -0800 Subject: [PATCH 01/30] Convert prepdocs to skills --- AGENTS.md | 14 +- app/backend/app.py | 3 +- app/backend/prepdocs.py | 297 +++---- .../prepdocslib/cloudingestionstrategy.py | 304 +++++++ app/backend/prepdocslib/figureprocessor.py | 160 ++++ app/backend/prepdocslib/filestrategy.py | 79 +- app/backend/prepdocslib/ingestionhelpers.py | 91 ++ app/backend/prepdocslib/page.py | 86 +- app/backend/prepdocslib/pdfparser.py | 106 +-- app/backend/prepdocslib/servicesetup.py | 202 +++++ app/backend/prepdocslib/textprocessor.py | 70 ++ app/functions/document_extractor/.funcignore | 11 + .../document_extractor/function_app.py | 210 +++++ app/functions/document_extractor/host.json | 20 + .../document_extractor/requirements.txt | 9 + app/functions/figure_processor/.funcignore | 11 + .../figure_processor/function_app.py | 174 ++++ app/functions/figure_processor/host.json | 20 + .../figure_processor/requirements.txt | 7 + app/functions/text_processor/.funcignore | 11 + app/functions/text_processor/function_app.py | 231 ++++++ app/functions/text_processor/host.json | 20 + app/functions/text_processor/requirements.txt | 9 + azure.yaml | 12 + docs/cloud_ingestion.md | 775 ++++++++++++++++++ infra/app/functions-app.bicep | 97 +++ infra/app/functions-rbac.bicep | 130 +++ infra/app/functions.bicep | 291 +++++++ infra/main.bicep | 41 + infra/main.parameters.json | 3 + tests/test_mediadescriber.py | 2 +- tests/test_pdfparser.py | 143 ++-- 32 files changed, 3261 insertions(+), 378 deletions(-) create mode 100644 app/backend/prepdocslib/cloudingestionstrategy.py create mode 100644 app/backend/prepdocslib/figureprocessor.py create mode 100644 app/backend/prepdocslib/ingestionhelpers.py create mode 100644 app/backend/prepdocslib/servicesetup.py create mode 100644 app/backend/prepdocslib/textprocessor.py create mode 100644 app/functions/document_extractor/.funcignore 
create mode 100644 app/functions/document_extractor/function_app.py create mode 100644 app/functions/document_extractor/host.json create mode 100644 app/functions/document_extractor/requirements.txt create mode 100644 app/functions/figure_processor/.funcignore create mode 100644 app/functions/figure_processor/function_app.py create mode 100644 app/functions/figure_processor/host.json create mode 100644 app/functions/figure_processor/requirements.txt create mode 100644 app/functions/text_processor/.funcignore create mode 100644 app/functions/text_processor/function_app.py create mode 100644 app/functions/text_processor/host.json create mode 100644 app/functions/text_processor/requirements.txt create mode 100644 docs/cloud_ingestion.md create mode 100644 infra/app/functions-app.bicep create mode 100644 infra/app/functions-rbac.bicep create mode 100644 infra/app/functions.bicep diff --git a/AGENTS.md b/AGENTS.md index 0021d98852..c3b0a4191d 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -17,6 +17,9 @@ If necessary, edit this file to ensure it accurately reflects the current state * app/backend/approaches/prompts/chat_query_rewrite.prompty: Prompt used to rewrite the query based off search history into a better search query * app/backend/approaches/prompts/chat_query_rewrite_tools.json: Tools used by the query rewriting prompt * app/backend/approaches/prompts/chat_answer_question.prompty: Prompt used by the Chat approach to actually answer the question based off sources + * app/backend/prepdocslib/cloudingestionstrategy.py: Builds the Azure AI Search indexer and skillset for the cloud ingestion pipeline + * app/backend/prepdocslib/pdfparser.py: Uses Azure Document Intelligence to emit page text plus figure placeholders + * app/backend/prepdocslib/figureprocessor.py: Shared helper that generates figure descriptions for both local ingestion and the cloud figure-processor skill * app/backend/app.py: The main entry point for the backend application. 
* app/frontend: Contains the React frontend code, built with TypeScript, built with vite. * app/frontend/src/api: Contains the API client code for communicating with the backend. @@ -49,6 +52,15 @@ When adding new azd environment variables, update: 1. .azdo/pipelines/azure-dev.yml: Add the new environment variable under `env` section 1. .github/workflows/azure-dev.yml: Add the new environment variable under `env` section +For cloud ingestion, `prepdocs.py --use-cloud-ingestion` expects the function endpoints and managed identity resource IDs in the azd environment. The search service must have a system- or user-assigned managed identity with access to the Azure Functions app: + +* `DOCUMENT_EXTRACTOR_SKILL_ENDPOINT` +* `DOCUMENT_EXTRACTOR_SKILL_RESOURCE_ID` +* `FIGURE_PROCESSOR_SKILL_ENDPOINT` +* `FIGURE_PROCESSOR_SKILL_RESOURCE_ID` +* `TEXT_PROCESSOR_SKILL_ENDPOINT` +* `TEXT_PROCESSOR_SKILL_RESOURCE_ID` + ## Adding a new setting to "Developer Settings" in RAG app When adding a new developer setting, update: @@ -65,7 +77,7 @@ When adding a new developer setting, update: * app/backend/approaches/retrievethenread.py : Retrieve from overrides parameter * app/backend/app.py: Some settings may need to be sent down in the /config route. -## When adding tests for a new feature: +## When adding tests for a new feature All tests are in the `tests` folder and use the pytest framework. 
There are three styles of tests: diff --git a/app/backend/app.py b/app/backend/app.py index ae38e70b12..4a1b751a9e 100644 --- a/app/backend/app.py +++ b/app/backend/app.py @@ -586,7 +586,7 @@ async def setup_clients(): current_app.config[CONFIG_USER_BLOB_MANAGER] = user_blob_manager # Set up ingester - file_processors = setup_file_processors( + file_processors, figure_processor = setup_file_processors( azure_credential=azure_credential, document_intelligence_service=os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE"), local_pdf_parser=os.getenv("USE_LOCAL_PDF_PARSER", "").lower() == "true", @@ -627,6 +627,7 @@ async def setup_clients(): image_embeddings=image_embeddings_service, search_field_name_embedding=AZURE_SEARCH_FIELD_NAME_EMBEDDING, blob_manager=user_blob_manager, + figure_processor=figure_processor, ) current_app.config[CONFIG_INGESTER] = ingester diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py index 1e8bd9a10d..6770ca310a 100644 --- a/app/backend/prepdocs.py +++ b/app/backend/prepdocs.py @@ -2,27 +2,22 @@ import asyncio import logging import os -from enum import Enum -from typing import Optional, Union +from typing import Union import aiohttp from azure.core.credentials import AzureKeyCredential from azure.core.credentials_async import AsyncTokenCredential -from azure.identity.aio import AzureDeveloperCliCredential, get_bearer_token_provider -from openai import AsyncAzureOpenAI, AsyncOpenAI +from azure.identity.aio import AzureDeveloperCliCredential +from openai import AsyncOpenAI from rich.logging import RichHandler from load_azd_env import load_azd_env -from prepdocslib.blobmanager import BlobManager +from prepdocslib.cloudingestionstrategy import CloudIngestionStrategy from prepdocslib.csvparser import CsvParser -from prepdocslib.embeddings import ( - AzureOpenAIEmbeddingService, - ImageEmbeddings, - OpenAIEmbeddingService, -) +from prepdocslib.embeddings import AzureOpenAIEmbeddingService, OpenAIEmbeddingService from 
prepdocslib.fileprocessor import FileProcessor from prepdocslib.filestrategy import FileStrategy -from prepdocslib.htmlparser import LocalHTMLParser +from prepdocslib.ingestionhelpers import select_parser from prepdocslib.integratedvectorizerstrategy import ( IntegratedVectorizerStrategy, ) @@ -33,11 +28,15 @@ LocalListFileStrategy, ) from prepdocslib.parser import Parser -from prepdocslib.pdfparser import ( - DocumentAnalysisParser, - LocalPdfParser, - MediaDescriptionStrategy, +from prepdocslib.servicesetup import ( + OpenAIHost, + setup_blob_manager, + setup_figure_processor, + setup_image_embeddings_service, + setup_openai_client, ) + +# Removed direct pdf parser imports (selection now via select_parser) from prepdocslib.strategy import DocumentAction, SearchInfo, Strategy from prepdocslib.textparser import TextParser from prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter @@ -52,6 +51,15 @@ def clean_key_if_exists(key: Union[str, None]) -> Union[str, None]: return None +def require_env_var(name: str) -> str: + """Fetch an environment variable or raise a helpful error if it is missing.""" + + value = os.getenv(name) + if not value: + raise ValueError(f"Environment variable {name} must be set to use cloud ingestion.") + return value + + async def check_search_service_connectivity(search_service: str) -> bool: """Check if the search service is accessible by hitting the /ping endpoint.""" ping_url = f"https://{search_service}.search.windows.net/ping" @@ -98,28 +106,6 @@ async def setup_search_info( ) -def setup_blob_manager( - azure_credential: AsyncTokenCredential, - storage_account: str, - storage_container: str, - storage_resource_group: str, - subscription_id: str, - storage_key: Union[str, None] = None, - image_storage_container: Union[str, None] = None, # Added this parameter -): - storage_creds: Union[AsyncTokenCredential, str] = azure_credential if storage_key is None else storage_key - - return BlobManager( - 
endpoint=f"https://{storage_account}.blob.core.windows.net", - container=storage_container, - account=storage_account, - credential=storage_creds, - resource_group=storage_resource_group, - subscription_id=subscription_id, - image_container=image_storage_container, - ) - - def setup_list_file_strategy( azure_credential: AsyncTokenCredential, local_files: Union[str, None], @@ -152,13 +138,6 @@ def setup_list_file_strategy( return list_file_strategy -class OpenAIHost(str, Enum): - OPENAI = "openai" - AZURE = "azure" - AZURE_CUSTOM = "azure_custom" - LOCAL = "local" - - def setup_embeddings_service( azure_credential: AsyncTokenCredential, openai_host: OpenAIHost, @@ -204,64 +183,6 @@ def setup_embeddings_service( ) -def setup_openai_client( - openai_host: OpenAIHost, - azure_credential: AsyncTokenCredential, - azure_openai_api_key: Union[str, None] = None, - azure_openai_api_version: Union[str, None] = None, - azure_openai_service: Union[str, None] = None, - azure_openai_custom_url: Union[str, None] = None, - openai_api_key: Union[str, None] = None, - openai_organization: Union[str, None] = None, -): - if openai_host not in OpenAIHost: - raise ValueError(f"Invalid OPENAI_HOST value: {openai_host}. 
Must be one of {[h.value for h in OpenAIHost]}.") - - openai_client: AsyncOpenAI - - if openai_host in [OpenAIHost.AZURE, OpenAIHost.AZURE_CUSTOM]: - if openai_host == OpenAIHost.AZURE_CUSTOM: - logger.info("OPENAI_HOST is azure_custom, setting up Azure OpenAI custom client") - if not azure_openai_custom_url: - raise ValueError("AZURE_OPENAI_CUSTOM_URL must be set when OPENAI_HOST is azure_custom") - endpoint = azure_openai_custom_url - else: - logger.info("OPENAI_HOST is azure, setting up Azure OpenAI client") - if not azure_openai_service: - raise ValueError("AZURE_OPENAI_SERVICE must be set when OPENAI_HOST is azure") - endpoint = f"https://{azure_openai_service}.openai.azure.com" - if azure_openai_api_key: - logger.info("AZURE_OPENAI_API_KEY_OVERRIDE found, using as api_key for Azure OpenAI client") - openai_client = AsyncAzureOpenAI( - api_version=azure_openai_api_version, azure_endpoint=endpoint, api_key=azure_openai_api_key - ) - else: - logger.info("Using Azure credential (passwordless authentication) for Azure OpenAI client") - token_provider = get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default") - openai_client = AsyncAzureOpenAI( - api_version=azure_openai_api_version, - azure_endpoint=endpoint, - azure_ad_token_provider=token_provider, - ) - elif openai_host == OpenAIHost.LOCAL: - logger.info("OPENAI_HOST is local, setting up local OpenAI client for OPENAI_BASE_URL with no key") - openai_client = AsyncOpenAI( - base_url=os.environ["OPENAI_BASE_URL"], - api_key="no-key-required", - ) - else: - logger.info( - "OPENAI_HOST is not azure, setting up OpenAI client using OPENAI_API_KEY and OPENAI_ORGANIZATION environment variables" - ) - if openai_api_key is None: - raise ValueError("OpenAI key is required when using the non-Azure OpenAI API") - openai_client = AsyncOpenAI( - api_key=openai_api_key, - organization=openai_organization, - ) - return openai_client - - def setup_file_processors( azure_credential: 
AsyncTokenCredential, document_intelligence_service: Union[str, None], @@ -277,45 +198,44 @@ def setup_file_processors( ): sentence_text_splitter = SentenceTextSplitter() - doc_int_parser: Optional[DocumentAnalysisParser] = None - # check if Azure Document Intelligence credentials are provided - if document_intelligence_service is not None: - documentintelligence_creds: Union[AsyncTokenCredential, AzureKeyCredential] = ( - azure_credential if document_intelligence_key is None else AzureKeyCredential(document_intelligence_key) - ) - doc_int_parser = DocumentAnalysisParser( - endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/", - credential=documentintelligence_creds, - media_description_strategy=( - MediaDescriptionStrategy.OPENAI - if use_multimodal - else ( - MediaDescriptionStrategy.CONTENTUNDERSTANDING - if use_content_understanding - else MediaDescriptionStrategy.NONE - ) - ), - openai_client=openai_client, - openai_model=openai_model, - openai_deployment=openai_deployment, - content_understanding_endpoint=content_understanding_endpoint, - ) - - pdf_parser: Optional[Parser] = None - if local_pdf_parser or document_intelligence_service is None: - pdf_parser = LocalPdfParser() - elif document_intelligence_service is not None: - pdf_parser = doc_int_parser - else: - logger.warning("No PDF parser available") - - html_parser: Optional[Parser] = None - if local_html_parser or document_intelligence_service is None: - html_parser = LocalHTMLParser() - elif document_intelligence_service is not None: - html_parser = doc_int_parser - else: - logger.warning("No HTML parser available") + # Build mapping of file extensions to parsers using shared select_parser helper. + # Each select attempt may instantiate a DI parser; duplication is acceptable at startup. 
+ def _try_select(ext: str, content_type: str) -> Parser | None: + file_name = f"dummy{ext}" + try: + return select_parser( + file_name=file_name, + content_type=content_type, + azure_credential=azure_credential, + document_intelligence_service=document_intelligence_service, + document_intelligence_key=document_intelligence_key, + process_figures=use_multimodal, + use_local_pdf_parser=local_pdf_parser, + use_local_html_parser=local_html_parser, + ) + except ValueError: + return None + + pdf_parser: Parser | None = _try_select(".pdf", "application/pdf") + html_parser: Parser | None = _try_select(".html", "text/html") + + # DI-only formats + di_exts = [ + ".docx", + ".pptx", + ".xlsx", + ".png", + ".jpg", + ".jpeg", + ".tiff", + ".bmp", + ".heic", + ] + di_parsers: dict[str, Parser] = {} + for ext in di_exts: + parser = _try_select(ext, "application/octet-stream") + if parser is not None: + di_parsers[ext] = parser # These file formats can always be parsed: file_processors = { @@ -326,39 +246,22 @@ def setup_file_processors( } # These require either a Python package or Document Intelligence if pdf_parser is not None: - file_processors.update({".pdf": FileProcessor(pdf_parser, sentence_text_splitter)}) + file_processors[".pdf"] = FileProcessor(pdf_parser, sentence_text_splitter) if html_parser is not None: - file_processors.update({".html": FileProcessor(html_parser, sentence_text_splitter)}) - # These file formats require Document Intelligence - if doc_int_parser is not None: - file_processors.update( - { - ".docx": FileProcessor(doc_int_parser, sentence_text_splitter), - ".pptx": FileProcessor(doc_int_parser, sentence_text_splitter), - ".xlsx": FileProcessor(doc_int_parser, sentence_text_splitter), - ".png": FileProcessor(doc_int_parser, sentence_text_splitter), - ".jpg": FileProcessor(doc_int_parser, sentence_text_splitter), - ".jpeg": FileProcessor(doc_int_parser, sentence_text_splitter), - ".tiff": FileProcessor(doc_int_parser, sentence_text_splitter), - ".bmp": 
FileProcessor(doc_int_parser, sentence_text_splitter), - ".heic": FileProcessor(doc_int_parser, sentence_text_splitter), - } - ) - return file_processors - - -def setup_image_embeddings_service( - azure_credential: AsyncTokenCredential, vision_endpoint: Union[str, None], use_multimodal: bool -) -> Union[ImageEmbeddings, None]: - image_embeddings_service: Optional[ImageEmbeddings] = None - if use_multimodal: - if vision_endpoint is None: - raise ValueError("An Azure AI Vision endpoint must be provided to use multimodal features.") - image_embeddings_service = ImageEmbeddings( - endpoint=vision_endpoint, - token_provider=get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default"), - ) - return image_embeddings_service + file_processors[".html"] = FileProcessor(html_parser, sentence_text_splitter) + for ext, parser in di_parsers.items(): + file_processors[ext] = FileProcessor(parser, sentence_text_splitter) + figure_processor = setup_figure_processor( + credential=azure_credential, + use_multimodal=use_multimodal, + use_content_understanding=use_content_understanding, + content_understanding_endpoint=content_understanding_endpoint, + openai_client=openai_client, + openai_model=openai_model, + openai_deployment=openai_deployment, + ) + + return file_processors, figure_processor async def main(strategy: Strategy, setup_index: bool = True): @@ -383,6 +286,11 @@ async def main(strategy: Strategy, setup_index: bool = True): parser.add_argument( "--disablebatchvectors", action="store_true", help="Don't compute embeddings in batch for the sections" ) + parser.add_argument( + "--use-cloud-ingestion", + action="store_true", + help="Use Azure AI Search indexer with cloud-hosted custom skills instead of local ingestion", + ) parser.add_argument( "--remove", action="store_true", @@ -437,6 +345,7 @@ async def main(strategy: Strategy, setup_index: bool = True): use_acls = os.getenv("AZURE_USE_AUTHENTICATION", "").lower() == "true" 
enforce_access_control = os.getenv("AZURE_ENFORCE_ACCESS_CONTROL", "").lower() == "true" enable_global_documents = os.getenv("AZURE_ENABLE_GLOBAL_DOCUMENT_ACCESS", "").lower() == "true" + use_cloud_ingestion = args.use_cloud_ingestion or os.getenv("USE_CLOUD_INGESTION", "").lower() == "true" dont_use_vectors = os.getenv("USE_VECTORS", "").lower() == "false" use_agentic_retrieval = os.getenv("USE_AGENTIC_RETRIEVAL", "").lower() == "true" use_content_understanding = os.getenv("USE_MEDIA_DESCRIBER_AZURE_CU", "").lower() == "true" @@ -548,7 +457,42 @@ async def main(strategy: Strategy, setup_index: bool = True): ) ingestion_strategy: Strategy - if use_int_vectorization: + if use_cloud_ingestion: + if args.category: + logger.warning("Category assignment is not currently supported with cloud ingestion; ignoring.") + if dont_use_vectors: + raise ValueError("USE_VECTORS must remain true when using cloud ingestion.") + if not openai_embeddings_service or not isinstance(openai_embeddings_service, AzureOpenAIEmbeddingService): + raise ValueError("Cloud ingestion requires Azure OpenAI embeddings to configure the search index.") + + document_extractor_uri = require_env_var("DOCUMENT_EXTRACTOR_SKILL_ENDPOINT") + document_extractor_resource_id = require_env_var("DOCUMENT_EXTRACTOR_SKILL_RESOURCE_ID") + figure_processor_uri = require_env_var("FIGURE_PROCESSOR_SKILL_ENDPOINT") + figure_processor_resource_id = require_env_var("FIGURE_PROCESSOR_SKILL_RESOURCE_ID") + text_processor_uri = require_env_var("TEXT_PROCESSOR_SKILL_ENDPOINT") + text_processor_resource_id = require_env_var("TEXT_PROCESSOR_SKILL_RESOURCE_ID") + search_embedding_field = require_env_var("AZURE_SEARCH_FIELD_NAME_EMBEDDING") + + ingestion_strategy = CloudIngestionStrategy( + list_file_strategy=list_file_strategy, + blob_manager=blob_manager, + search_info=search_info, + embeddings=openai_embeddings_service, + search_field_name_embedding=search_embedding_field, + document_extractor_uri=document_extractor_uri, + 
document_extractor_auth_resource_id=document_extractor_resource_id, + figure_processor_uri=figure_processor_uri, + figure_processor_auth_resource_id=figure_processor_resource_id, + text_processor_uri=text_processor_uri, + text_processor_auth_resource_id=text_processor_resource_id, + subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"], + document_action=document_action, + search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"), + use_acls=use_acls, + use_multimodal=use_multimodal, + enforce_access_control=enforce_access_control, + ) + elif use_int_vectorization: if not openai_embeddings_service or not isinstance(openai_embeddings_service, AzureOpenAIEmbeddingService): raise Exception("Integrated vectorization strategy requires an Azure OpenAI embeddings service") @@ -567,7 +511,7 @@ async def main(strategy: Strategy, setup_index: bool = True): enforce_access_control=enforce_access_control, ) else: - file_processors = setup_file_processors( + file_processors, figure_processor = setup_file_processors( azure_credential=azd_credential, document_intelligence_service=os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE"), document_intelligence_key=clean_key_if_exists(args.documentintelligencekey), @@ -600,8 +544,7 @@ async def main(strategy: Strategy, setup_index: bool = True): search_field_name_embedding=os.getenv("AZURE_SEARCH_FIELD_NAME_EMBEDDING", "embedding"), use_acls=use_acls, category=args.category, - use_content_understanding=use_content_understanding, - content_understanding_endpoint=os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT"), + figure_processor=figure_processor, enforce_access_control=enforce_access_control, ) diff --git a/app/backend/prepdocslib/cloudingestionstrategy.py b/app/backend/prepdocslib/cloudingestionstrategy.py new file mode 100644 index 0000000000..7139ee6412 --- /dev/null +++ b/app/backend/prepdocslib/cloudingestionstrategy.py @@ -0,0 +1,304 @@ +"""Cloud ingestion strategy using Azure AI Search custom skills.""" + +from __future__ import 
annotations + +import logging +from dataclasses import dataclass +from datetime import timedelta + +from azure.search.documents.indexes._generated.models import ( + NativeBlobSoftDeleteDeletionDetectionPolicy, +) +from azure.search.documents.indexes.models import ( + IndexProjectionMode, + InputFieldMappingEntry, + OutputFieldMappingEntry, + SearchIndexer, + SearchIndexerDataContainer, + SearchIndexerDataSourceConnection, + SearchIndexerDataSourceType, + SearchIndexerIndexProjection, + SearchIndexerIndexProjectionSelector, + SearchIndexerIndexProjectionsParameters, + SearchIndexerSkillset, + WebApiSkill, +) + +from .blobmanager import BlobManager +from .embeddings import AzureOpenAIEmbeddingService +from .listfilestrategy import ListFileStrategy +from .searchmanager import SearchManager +from .strategy import DocumentAction, SearchInfo, Strategy + +logger = logging.getLogger("scripts") + +_DEFAULT_TIMEOUT = timedelta(seconds=230) +_DEFAULT_BATCH_SIZE = 1 + + +@dataclass(slots=True) +class _SkillConfig: + """Configuration for a custom Web API skill.""" + + name: str + description: str + uri: str + auth_resource_id: str + + +class CloudIngestionStrategy(Strategy): + """Ingestion strategy that wires Azure Function custom skills into an indexer.""" + + def __init__( + self, + *, + list_file_strategy: ListFileStrategy, + blob_manager: BlobManager, + search_info: SearchInfo, + embeddings: AzureOpenAIEmbeddingService, + search_field_name_embedding: str, + document_extractor_uri: str, + document_extractor_auth_resource_id: str, + figure_processor_uri: str, + figure_processor_auth_resource_id: str, + text_processor_uri: str, + text_processor_auth_resource_id: str, + subscription_id: str, + document_action: DocumentAction = DocumentAction.Add, + search_analyzer_name: str | None = None, + use_acls: bool = False, + use_multimodal: bool = False, + enforce_access_control: bool = False, + ) -> None: + if not search_field_name_embedding: + raise 
ValueError("search_field_name_embedding must be provided for cloud ingestion") + if not document_extractor_uri: + raise ValueError("document_extractor_uri must be provided for cloud ingestion") + if not document_extractor_auth_resource_id: + raise ValueError("document_extractor_auth_resource_id must be provided for cloud ingestion") + if not figure_processor_uri: + raise ValueError("figure_processor_uri must be provided for cloud ingestion") + if not figure_processor_auth_resource_id: + raise ValueError("figure_processor_auth_resource_id must be provided for cloud ingestion") + if not text_processor_uri: + raise ValueError("text_processor_uri must be provided for cloud ingestion") + if not text_processor_auth_resource_id: + raise ValueError("text_processor_auth_resource_id must be provided for cloud ingestion") + + self.list_file_strategy = list_file_strategy + self.blob_manager = blob_manager + self.document_action = document_action + self.embeddings = embeddings + self.search_field_name_embedding = search_field_name_embedding + self.search_info = search_info + self.search_analyzer_name = search_analyzer_name + self.use_acls = use_acls + self.use_multimodal = use_multimodal + self.enforce_access_control = enforce_access_control + self.subscription_id = subscription_id + + prefix = f"{self.search_info.index_name}-cloud" + self.skillset_name = f"{prefix}-skillset" + self.indexer_name = f"{prefix}-indexer" + self.data_source_name = f"{prefix}-blob" + + self.document_extractor = _SkillConfig( + name=f"{prefix}-document-extractor-skill", + description="Custom skill that downloads and parses source documents", + uri=document_extractor_uri, + auth_resource_id=document_extractor_auth_resource_id, + ) + self.figure_processor = _SkillConfig( + name=f"{prefix}-figure-processor-skill", + description="Custom skill that enriches individual figures", + uri=figure_processor_uri, + auth_resource_id=figure_processor_auth_resource_id, + ) + self.text_processor = _SkillConfig( + 
name=f"{prefix}-text-processor-skill", + description="Custom skill that merges figures, chunks text, and generates embeddings", + uri=text_processor_uri, + auth_resource_id=text_processor_auth_resource_id, + ) + + self._search_manager: SearchManager | None = None + + def _build_search_manager(self) -> SearchManager: + if not isinstance(self.embeddings, AzureOpenAIEmbeddingService): + raise TypeError("Cloud ingestion requires AzureOpenAIEmbeddingService for search index setup") + + return SearchManager( + search_info=self.search_info, + search_analyzer_name=self.search_analyzer_name, + use_acls=self.use_acls, + use_int_vectorization=True, + embeddings=self.embeddings, + field_name_embedding=self.search_field_name_embedding, + search_images=self.use_multimodal, + enforce_access_control=self.enforce_access_control, + ) + + def _build_document_extractor_skill(self) -> WebApiSkill: + outputs = [ + OutputFieldMappingEntry(name="pages", target_name="pages"), + OutputFieldMappingEntry(name="figures", target_name="figures"), + ] + + return WebApiSkill( + name=self.document_extractor.name, + description=self.document_extractor.description, + context="/document", + uri=self.document_extractor.uri, + http_method="POST", + timeout=_DEFAULT_TIMEOUT, + batch_size=_DEFAULT_BATCH_SIZE, + degree_of_parallelism=1, + # Managed identity: Search service authenticates against the function app using this resource ID. 
+ auth_resource_id=self.document_extractor.auth_resource_id, + inputs=[ + InputFieldMappingEntry(name="blobUrl", source="/document/metadata_storage_path"), + InputFieldMappingEntry(name="file_name", source="/document/metadata_storage_name"), + InputFieldMappingEntry(name="content_type", source="/document/metadata_storage_content_type"), + InputFieldMappingEntry( + name="metadata_storage_sas_token", source="/document/metadata_storage_sas_token" + ), + ], + outputs=outputs, + ) + + def _build_figure_processor_skill(self) -> WebApiSkill: + inputs = [ + InputFieldMappingEntry(name="figure_id", source="/document/figures/*/figure_id"), + InputFieldMappingEntry(name="document_file_name", source="/document/figures/*/document_file_name"), + InputFieldMappingEntry(name="filename", source="/document/figures/*/filename"), + InputFieldMappingEntry(name="mime_type", source="/document/figures/*/mime_type"), + InputFieldMappingEntry(name="bytes_base64", source="/document/figures/*/bytes_base64"), + InputFieldMappingEntry(name="page_num", source="/document/figures/*/page_num"), + InputFieldMappingEntry(name="bbox", source="/document/figures/*/bbox"), + ] + outputs = [ + OutputFieldMappingEntry(name="caption", target_name="caption"), + OutputFieldMappingEntry(name="url", target_name="url"), + ] + if self.use_multimodal: + outputs.append(OutputFieldMappingEntry(name="imageEmbedding", target_name="imageEmbedding")) + + return WebApiSkill( + name=self.figure_processor.name, + description=self.figure_processor.description, + context="/document/figures/*", + uri=self.figure_processor.uri, + http_method="POST", + timeout=_DEFAULT_TIMEOUT, + batch_size=_DEFAULT_BATCH_SIZE, + degree_of_parallelism=1, + # Managed identity: Search service authenticates against the function app using this resource ID. 
+ auth_resource_id=self.figure_processor.auth_resource_id, + inputs=inputs, + outputs=outputs, + ) + + def _build_text_processor_skill(self) -> WebApiSkill: + inputs = [ + InputFieldMappingEntry(name="pages", source="/document/pages"), + InputFieldMappingEntry(name="figures", source="/document/figures"), + InputFieldMappingEntry(name="file_name", source="/document/metadata_storage_name"), + InputFieldMappingEntry(name="storageUrl", source="/document/metadata_storage_path"), + ] + + return WebApiSkill( + name=self.text_processor.name, + description=self.text_processor.description, + context="/document", + uri=self.text_processor.uri, + http_method="POST", + timeout=_DEFAULT_TIMEOUT, + batch_size=_DEFAULT_BATCH_SIZE, + degree_of_parallelism=1, + # Managed identity: Search service authenticates against the function app using this resource ID. + auth_resource_id=self.text_processor.auth_resource_id, + inputs=inputs, + outputs=[OutputFieldMappingEntry(name="chunks", target_name="chunks")], + ) + + def _build_skillset(self) -> SearchIndexerSkillset: + mappings = [ + InputFieldMappingEntry(name="id", source="/document/chunks/*/id"), + InputFieldMappingEntry(name="content", source="/document/chunks/*/content"), + InputFieldMappingEntry(name="sourcepage", source="/document/chunks/*/sourcepage"), + InputFieldMappingEntry(name="sourcefile", source="/document/chunks/*/sourcefile"), + InputFieldMappingEntry(name=self.search_field_name_embedding, source="/document/chunks/*/embedding"), + InputFieldMappingEntry(name="storageUrl", source="/document/metadata_storage_path"), + ] + + index_projection = SearchIndexerIndexProjection( + selectors=[ + SearchIndexerIndexProjectionSelector( + target_index_name=self.search_info.index_name, + parent_key_field_name="parent_id", + source_context="/document/chunks/*", + mappings=mappings, + ) + ], + parameters=SearchIndexerIndexProjectionsParameters( + projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS, + ), + ) + + return 
SearchIndexerSkillset( + name=self.skillset_name, + description="Skillset linking document extraction, figure enrichment, and text processing functions", + skills=[ + self._build_document_extractor_skill(), + self._build_figure_processor_skill(), + self._build_text_processor_skill(), + ], + index_projection=index_projection, + ) + + async def setup(self) -> None: + logger.info("Setting up search index and skillset for cloud ingestion") + self._search_manager = self._build_search_manager() + await self._search_manager.create_index() + + async with self.search_info.create_search_indexer_client() as indexer_client: + data_source_connection = SearchIndexerDataSourceConnection( + name=self.data_source_name, + type=SearchIndexerDataSourceType.AZURE_BLOB, + connection_string=self.blob_manager.get_managedidentity_connectionstring(), + container=SearchIndexerDataContainer(name=self.blob_manager.container), + data_deletion_detection_policy=NativeBlobSoftDeleteDeletionDetectionPolicy(), + ) + await indexer_client.create_or_update_data_source_connection(data_source_connection) + + skillset = self._build_skillset() + await indexer_client.create_or_update_skillset(skillset) + + async def run(self) -> None: + if self.document_action == DocumentAction.Add: + files = self.list_file_strategy.list() + async for file in files: + try: + await self.blob_manager.upload_blob(file) + finally: + if file: + file.close() + elif self.document_action == DocumentAction.Remove: + paths = self.list_file_strategy.list_paths() + async for path in paths: + await self.blob_manager.remove_blob(path) + elif self.document_action == DocumentAction.RemoveAll: + await self.blob_manager.remove_blob() + + indexer = SearchIndexer( + name=self.indexer_name, + description="Indexer orchestrating cloud ingestion pipeline", + data_source_name=self.data_source_name, + target_index_name=self.search_info.index_name, + skillset_name=self.skillset_name, + ) + + async with self.search_info.create_search_indexer_client() 
as indexer_client: + await indexer_client.create_or_update_indexer(indexer) + await indexer_client.run_indexer(self.indexer_name) + logger.info("Triggered indexer '%s' for cloud ingestion", self.indexer_name) diff --git a/app/backend/prepdocslib/figureprocessor.py b/app/backend/prepdocslib/figureprocessor.py new file mode 100644 index 0000000000..f2dfd2ae8a --- /dev/null +++ b/app/backend/prepdocslib/figureprocessor.py @@ -0,0 +1,160 @@ +"""Utilities for describing and enriching figures outside of document parsing.""" + +import logging +from enum import Enum +from typing import TYPE_CHECKING, Any, Optional + +from azure.core.credentials import AzureKeyCredential +from azure.core.credentials_async import AsyncTokenCredential + +from .mediadescriber import ( + ContentUnderstandingDescriber, + MediaDescriber, + MultimodalModelDescriber, +) + +if TYPE_CHECKING: # pragma: no cover - used only for type hints + from .blobmanager import BaseBlobManager + from .embeddings import ImageEmbeddings + from .page import ImageOnPage + +logger = logging.getLogger("scripts") + + +class MediaDescriptionStrategy(Enum): + """Supported mechanisms for describing images extracted from documents.""" + + NONE = "none" + OPENAI = "openai" + CONTENTUNDERSTANDING = "content_understanding" + + +class FigureProcessor: + """Helper that lazily creates a media describer and captions figures on demand.""" + + def __init__( + self, + *, + credential: AsyncTokenCredential | AzureKeyCredential | None = None, + strategy: MediaDescriptionStrategy = MediaDescriptionStrategy.NONE, + openai_client: Any | None = None, + openai_model: str | None = None, + openai_deployment: str | None = None, + content_understanding_endpoint: str | None = None, + ) -> None: + self._credential = credential + self._strategy = strategy + self._openai_client = openai_client + self._openai_model = openai_model + self._openai_deployment = openai_deployment + self._content_understanding_endpoint = content_understanding_endpoint + 
self._media_describer: MediaDescriber | None = None + self._content_understanding_ready = False + + @property + def strategy(self) -> MediaDescriptionStrategy: + return self._strategy + + async def get_media_describer(self) -> MediaDescriber | None: + """Return (and lazily create) the media describer for this processor.""" + + if self._strategy == MediaDescriptionStrategy.NONE: + return None + + if self._media_describer is not None: + return self._media_describer + + if self._strategy == MediaDescriptionStrategy.CONTENTUNDERSTANDING: + if self._content_understanding_endpoint is None: + raise ValueError("Content Understanding strategy requires an endpoint") + if self._credential is None: + raise ValueError("Content Understanding strategy requires a credential") + if isinstance(self._credential, AzureKeyCredential): + raise ValueError( + "Content Understanding does not support key credentials; provide a token credential instead" + ) + self._media_describer = ContentUnderstandingDescriber( + self._content_understanding_endpoint, self._credential + ) + return self._media_describer + + if self._strategy == MediaDescriptionStrategy.OPENAI: + if self._openai_client is None or self._openai_model is None: + raise ValueError("OpenAI strategy requires both a client and a model name") + self._media_describer = MultimodalModelDescriber( + self._openai_client, model=self._openai_model, deployment=self._openai_deployment + ) + return self._media_describer + + logger.warning("Unknown media description strategy '%s'; skipping description", self._strategy) + return None + + def mark_content_understanding_ready(self) -> None: + """Record that the Content Understanding analyzer exists to avoid recreating it.""" + + self._content_understanding_ready = True + + async def describe(self, image_bytes: bytes) -> str | None: + """Generate a description for the provided image bytes if a describer is available.""" + + describer = await self.get_media_describer() + if describer is None: + 
def build_figure_markup(image: "ImageOnPage", description: Optional[str] = None) -> str:
    """Create consistent HTML markup for a figure, optionally embedding its description.

    Args:
        image: The figure whose id/title form the caption.
        description: Optional plain-text description appended after the caption.

    Returns:
        An HTML ``<figure>`` snippet with a ``<figcaption>`` caption.

    NOTE(review): the literal tags were garbled in the source under review; this
    reconstruction follows the parser's previous inline format
    (``<figure><figcaption>id title</figcaption>description</figure>``) — confirm
    against the text_processor skill's expectations.
    """
    caption_parts = [image.figure_id]
    if image.title:
        caption_parts.append(image.title)
    caption = " ".join(part for part in caption_parts if part)
    if description:
        return f"<figure><figcaption>{caption}</figcaption>{description}</figure>"
    return f"<figure><figcaption>{caption}</figcaption></figure>"


async def process_page_image(
    *,
    image: "ImageOnPage",
    document_filename: str,
    blob_manager: Optional["BaseBlobManager"],
    image_embeddings_client: Optional["ImageEmbeddings"],
    figure_processor: Optional["FigureProcessor"] = None,
    user_oid: Optional[str] = None,
) -> "ImageOnPage":
    """Generate description, upload image, and optionally compute embedding for a figure.

    Only ``blob_manager`` is strictly required (for upload); the describer and the
    embeddings client are optional and their steps are skipped when absent.

    Returns:
        The mutated ``ImageOnPage`` (for functional-style chaining).

    Raises:
        ValueError: ``blob_manager`` is None.
    """
    if blob_manager is None:
        raise ValueError("BlobManager must be provided to process images.")

    # Store plain descriptive text only (may be None); HTML rendering is
    # deferred to build_figure_markup.
    description_text: Optional[str] = None
    if figure_processor is not None:
        description_text = await figure_processor.describe(image.bytes)
    image.description = description_text

    if image.url is None:
        image.url = await blob_manager.upload_document_image(
            document_filename, image.bytes, image.filename, image.page_num, user_oid=user_oid
        )

    if image_embeddings_client is not None:
        try:
            image.embedding = await image_embeddings_client.create_embedding_for_image(image.bytes)
        except Exception:  # embedding failures shouldn't abort figure processing
            logger.warning("Image embedding generation failed for figure %s", image.figure_id, exc_info=True)

    return image
async def parse_file(
    file: File,
    file_processors: dict[str, FileProcessor],
    category: Optional[str] = None,
    blob_manager: Optional[BaseBlobManager] = None,
    image_embeddings_client: Optional[ImageEmbeddings] = None,
    figure_processor: Optional[FigureProcessor] = None,
    user_oid: Optional[str] = None,
) -> list[Section]:
    """Parse a file into pages, enrich each figure, and split the text into sections.

    Files whose extension has no registered processor are skipped (empty list).
    """
    extension = file.file_extension().lower()
    processor = file_processors.get(extension)
    if processor is None:
        # NOTE(review): this log line was elided by the diff under review;
        # wording reconstructed — verify against the original source.
        logger.info("Skipping '%s', no parser found.", file.filename())
        return []
    logger.info("Ingesting '%s'", file.filename())
    pages = [page async for page in processor.parser.parse(content=file.content)]
    total_images = sum(len(page.images) for page in pages)
    logger.info("Found %d images across %d pages", total_images, len(pages))
    for page in pages:
        for image in page.images:
            logger.info("Processing image '%s' on page %d", image.filename, page.page_num)
            await process_page_image(
                image=image,
                document_filename=file.filename(),
                blob_manager=blob_manager,
                image_embeddings_client=image_embeddings_client,
                figure_processor=figure_processor,
                user_oid=user_oid,
            )
    return process_text(pages, file, processor.splitter, category)
use_content_understanding - self.content_understanding_endpoint = content_understanding_endpoint + self.figure_processor = figure_processor self.enforce_access_control = enforce_access_control def setup_search_manager(self): @@ -102,15 +103,14 @@ async def setup(self): self.setup_search_manager() await self.search_manager.create_index() - if self.use_content_understanding: - if self.content_understanding_endpoint is None: - raise ValueError("Content Understanding is enabled but no endpoint was provided") - if isinstance(self.search_info.credential, AzureKeyCredential): - raise ValueError( - "AzureKeyCredential is not supported for Content Understanding, use keyless auth instead" - ) - cu_manager = ContentUnderstandingDescriber(self.content_understanding_endpoint, self.search_info.credential) - await cu_manager.create_analyzer() + if ( + self.figure_processor is not None + and self.figure_processor.strategy == MediaDescriptionStrategy.CONTENTUNDERSTANDING + ): + media_describer = await self.figure_processor.get_media_describer() + if isinstance(media_describer, ContentUnderstandingDescriber): + await media_describer.create_analyzer() + self.figure_processor.mark_content_understanding_ready() async def run(self): self.setup_search_manager() @@ -120,7 +120,12 @@ async def run(self): try: await self.blob_manager.upload_blob(file) sections = await parse_file( - file, self.file_processors, self.category, self.blob_manager, self.image_embeddings + file, + self.file_processors, + self.category, + self.blob_manager, + self.image_embeddings, + figure_processor=self.figure_processor, ) if sections: await self.search_manager.update_content(sections, url=file.url) @@ -151,12 +156,14 @@ def __init__( embeddings: Optional[OpenAIEmbeddings] = None, image_embeddings: Optional[ImageEmbeddings] = None, enforce_access_control: bool = False, + figure_processor: Optional[FigureProcessor] = None, ): self.file_processors = file_processors self.embeddings = embeddings self.image_embeddings 
= image_embeddings self.search_info = search_info self.blob_manager = blob_manager + self.figure_processor = figure_processor self.search_manager = SearchManager( search_info=self.search_info, search_analyzer_name=None, @@ -164,14 +171,20 @@ def __init__( use_int_vectorization=False, embeddings=self.embeddings, field_name_embedding=search_field_name_embedding, - search_images=False, + search_images=image_embeddings is not None, enforce_access_control=enforce_access_control, ) self.search_field_name_embedding = search_field_name_embedding async def add_file(self, file: File, user_oid: str): sections = await parse_file( - file, self.file_processors, None, self.blob_manager, self.image_embeddings, user_oid=user_oid + file, + self.file_processors, + None, + self.blob_manager, + self.image_embeddings, + figure_processor=self.figure_processor, + user_oid=user_oid, ) if sections: await self.search_manager.update_content(sections, url=file.url) diff --git a/app/backend/prepdocslib/ingestionhelpers.py b/app/backend/prepdocslib/ingestionhelpers.py new file mode 100644 index 0000000000..e955035728 --- /dev/null +++ b/app/backend/prepdocslib/ingestionhelpers.py @@ -0,0 +1,91 @@ +"""Shared ingestion helper functions for parser selection and setup. + +These utilities allow both local scripts (prepdocs.py) and Azure Functions +(document_extractor) to reuse consistent logic for selecting parsers. 
+""" + +from __future__ import annotations + +from azure.core.credentials import AzureKeyCredential +from azure.core.credentials_async import AsyncTokenCredential + +from .htmlparser import LocalHTMLParser +from .parser import Parser +from .pdfparser import DocumentAnalysisParser, LocalPdfParser +from .textparser import TextParser + + +def select_parser( + *, + file_name: str, + content_type: str, + azure_credential: AsyncTokenCredential, + document_intelligence_service: str | None, + document_intelligence_key: str | None = None, + process_figures: bool = False, + use_local_pdf_parser: bool = False, + use_local_html_parser: bool = False, +) -> Parser: + """Return a parser instance appropriate for the file type and configuration. + + Args: + file_name: Source filename (used to derive extension) + content_type: MIME type (fallback for extension-based selection) + azure_credential: Token credential for DI service + document_intelligence_service: Name of DI service (None disables DI) + document_intelligence_key: Optional key credential (overrides token when provided) + process_figures: Whether figure extraction should be enabled in DI parser + use_local_pdf_parser: Force local PDF parsing instead of DI + use_local_html_parser: Force local HTML parsing instead of DI + + Returns: + Parser capable of yielding Page objects for the document. + + Raises: + ValueError: Unsupported file type or missing DI configuration for required formats. + """ + extension = file_name.lower().rsplit(".", 1)[-1] if "." 
in file_name else "" + ext_with_dot = f".{extension}" if extension else "" + + # Build DI parser lazily only if needed + di_parser: DocumentAnalysisParser | None = None + if document_intelligence_service: + credential: AsyncTokenCredential | AzureKeyCredential + if document_intelligence_key: + credential = AzureKeyCredential(document_intelligence_key) + else: + credential = azure_credential + di_parser = DocumentAnalysisParser( + endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/", + credential=credential, + process_figures=process_figures, + ) + + # Plain text / structured text formats always local + if ext_with_dot in {".txt", ".md", ".csv", ".json"} or content_type.startswith("text/plain"): + return TextParser() + + # HTML + if ext_with_dot in {".html", ".htm"} or content_type in {"text/html", "application/html"}: + if use_local_html_parser or not di_parser: + return LocalHTMLParser() + return di_parser + + # PDF + if ext_with_dot == ".pdf": + if use_local_pdf_parser or not di_parser: + return LocalPdfParser() + return di_parser + + # Formats requiring DI + di_required_exts = {".docx", ".pptx", ".xlsx", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".heic"} + if ext_with_dot in di_required_exts: + if not di_parser: + raise ValueError("Document Intelligence service must be configured to process this file type") + return di_parser + + # Fallback: if MIME suggests application/* and DI available, use DI + if content_type.startswith("application/") and di_parser: + return di_parser + + raise ValueError(f"Unsupported file type: {file_name}") diff --git a/app/backend/prepdocslib/page.py b/app/backend/prepdocslib/page.py index b87a81e88f..3cfaba6819 100644 --- a/app/backend/prepdocslib/page.py +++ b/app/backend/prepdocslib/page.py @@ -1,5 +1,6 @@ -from dataclasses import dataclass, field -from typing import Optional +import base64 +from dataclasses import asdict, dataclass, field +from typing import Any, Optional @dataclass @@ -7,11 +8,89 @@ 
@dataclass
class ImageOnPage:
    """A figure cropped out of a document page, plus its enrichment state."""

    bytes: bytes
    bbox: tuple[float, float, float, float]  # Pixels
    filename: str
    figure_id: str
    page_num: int  # 0-indexed
    placeholder: str  # HTML placeholder in the page text marking where the figure sits
    mime_type: str = "image/png"  # Set by parser; default assumes PNG rendering
    url: Optional[str] = None
    title: str = ""
    embedding: Optional[list[float]] = None
    description: Optional[str] = None

    def to_skill_payload(
        self,
        file_name: str,
        *,
        include_bytes: bool = False,
        include_bytes_base64: bool = True,
    ) -> dict[str, Any]:
        """Serialize this figure for the figure_processor skill output.

        Args:
            file_name: Source document file name (stored as ``document_file_name``).
            include_bytes: When True, include the raw ``bytes`` field. Defaults to
                False to keep payloads lean and JSON-friendly.
            include_bytes_base64: When True (default), include a base64 copy of the
                image as ``bytes_base64`` for downstream skills.
        """
        data = asdict(self)

        if not include_bytes:
            # Drop raw bytes to keep the payload lean and JSON-serializable.
            data.pop("bytes", None)

        if include_bytes_base64:
            # Encode from the current in-memory bytes to ensure fidelity.
            raw = self.bytes if isinstance(self.bytes, (bytes, bytearray)) else b""
            data["bytes_base64"] = base64.b64encode(raw).decode("utf-8")

        data["document_file_name"] = file_name
        return data

    @classmethod
    def from_skill_payload(cls, data: dict[str, Any]) -> tuple["ImageOnPage", str]:
        """Deserialize a figure skill payload, normalizing loosely-typed fields.

        Returns:
            (image, document_file_name) tuple.

        Raises:
            ValueError: payload is missing or carries invalid ``bytes_base64``.

        Fixes over the previous version:
        - ``bbox`` arriving as a tuple (in-process round trip) is preserved rather
          than silently reset to ``(0, 0, 0, 0)``; JSON round trips (lists) already worked.
        - Missing string fields default to ``""`` so the declared field types hold.
        - Optional enrichment fields (url, description, embedding) are restored
          when present instead of being dropped.
        """
        bytes_base64 = data.get("bytes_base64")
        if not bytes_base64:
            raise ValueError("Figure payload missing required bytes_base64 field")
        try:
            raw_bytes = base64.b64decode(bytes_base64)
        except Exception as exc:  # defensive: malformed upstream payloads
            raise ValueError("Invalid bytes_base64 image data") from exc

        # page_num may arrive as a string; coerce, defaulting to 0.
        try:
            page_num = int(data.get("page_num") or 0)
        except Exception:
            page_num = 0

        # bbox may arrive as a JSON list or an in-process tuple.
        bbox_val = data.get("bbox")
        if isinstance(bbox_val, (list, tuple)) and len(bbox_val) == 4:
            bbox = tuple(bbox_val)  # type: ignore[assignment]
        else:
            bbox = (0, 0, 0, 0)

        image = cls(
            bytes=raw_bytes,
            bbox=bbox,
            page_num=page_num,
            filename=data.get("filename") or "",
            figure_id=data.get("figure_id") or "",
            placeholder=data.get("placeholder") or "",
            mime_type=data.get("mime_type") or "image/png",
            title=data.get("title") or "",
            url=data.get("url"),
            description=data.get("description"),
            embedding=data.get("embedding"),
        )
        return image, data.get("document_file_name", "")
DocumentIntelligenceClient @@ -17,15 +17,9 @@ from azure.core.credentials import AzureKeyCredential from azure.core.credentials_async import AsyncTokenCredential from azure.core.exceptions import HttpResponseError -from openai import AsyncOpenAI from PIL import Image from pypdf import PdfReader -from .mediadescriber import ( - ContentUnderstandingDescriber, - MediaDescriber, - MultimodalModelDescriber, -) from .page import ImageOnPage, Page from .parser import Parser @@ -50,12 +44,6 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]: offset += len(page_text) -class MediaDescriptionStrategy(Enum): - NONE = "none" - OPENAI = "openai" - CONTENTUNDERSTANDING = "content_understanding" - - class DocumentAnalysisParser(Parser): """ Concrete parser backed by Azure AI Document Intelligence that can parse many document formats into pages @@ -65,30 +53,14 @@ class DocumentAnalysisParser(Parser): def __init__( self, endpoint: str, - credential: Union[AsyncTokenCredential, AzureKeyCredential], - model_id="prebuilt-layout", - media_description_strategy: Enum = MediaDescriptionStrategy.NONE, - # If using OpenAI, this is the client to use - openai_client: Union[AsyncOpenAI, None] = None, - openai_model: Optional[str] = None, - openai_deployment: Optional[str] = None, - # If using Content Understanding, this is the endpoint for the service - content_understanding_endpoint: Union[str, None] = None, - # should this take the blob storage info too? 
- ): + credential: AsyncTokenCredential | AzureKeyCredential, + model_id: str = "prebuilt-layout", + process_figures: bool = False, + ) -> None: self.model_id = model_id self.endpoint = endpoint self.credential = credential - self.media_description_strategy = media_description_strategy - if media_description_strategy == MediaDescriptionStrategy.OPENAI: - logger.info("Including media description with OpenAI") - self.use_content_understanding = False - self.openai_client = openai_client - self.openai_model = openai_model - self.openai_deployment = openai_deployment - if media_description_strategy == MediaDescriptionStrategy.CONTENTUNDERSTANDING: - logger.info("Including media description with Azure Content Understanding") - self.content_understanding_endpoint = content_understanding_endpoint + self.process_figures = process_figures async def parse(self, content: IO) -> AsyncGenerator[Page, None]: logger.info("Extracting text from '%s' using Azure Document Intelligence", content.name) @@ -97,27 +69,7 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]: endpoint=self.endpoint, credential=self.credential ) as document_intelligence_client: file_analyzed = False - - media_describer: Union[ContentUnderstandingDescriber, MultimodalModelDescriber, None] = None - if self.media_description_strategy == MediaDescriptionStrategy.CONTENTUNDERSTANDING: - if self.content_understanding_endpoint is None: - raise ValueError( - "Content Understanding endpoint must be provided when using Content Understanding strategy" - ) - if isinstance(self.credential, AzureKeyCredential): - raise ValueError( - "AzureKeyCredential is not supported for Content Understanding, use keyless auth instead" - ) - media_describer = ContentUnderstandingDescriber(self.content_understanding_endpoint, self.credential) - - if self.media_description_strategy == MediaDescriptionStrategy.OPENAI: - if self.openai_client is None or self.openai_model is None: - raise ValueError("OpenAI client must be 
provided when using OpenAI media description strategy") - media_describer = MultimodalModelDescriber( - self.openai_client, self.openai_model, self.openai_deployment - ) - - if media_describer is not None: + if self.process_figures: content_bytes = content.read() try: poller = await document_intelligence_client.begin_analyze_document( @@ -156,13 +108,14 @@ async def parse(self, content: IO) -> AsyncGenerator[Page, None]: if table.bounding_regions and table.bounding_regions[0].page_number == page.page_number ] figures_on_page = [] - if self.media_description_strategy != MediaDescriptionStrategy.NONE: + if self.process_figures: figures_on_page = [ figure for figure in (analyze_result.figures or []) if figure.bounding_regions and figure.bounding_regions[0].page_number == page.page_number ] page_images: list[ImageOnPage] = [] + page_tables: list[str] = [] class ObjectType(Enum): NONE = -1 @@ -200,46 +153,52 @@ class ObjectType(Enum): if object_idx is None: raise ValueError("Expected object_idx to be set") if mask_char not in added_objects: - page_text += DocumentAnalysisParser.table_to_html(tables_on_page[object_idx]) + table_html = DocumentAnalysisParser.table_to_html(tables_on_page[object_idx]) + page_tables.append(table_html) + page_text += table_html added_objects.add(mask_char) elif object_type == ObjectType.FIGURE: - if media_describer is None: - raise ValueError("media_describer should not be None, unable to describe figure") if object_idx is None: raise ValueError("Expected object_idx to be set") if mask_char not in added_objects: - image_on_page = await DocumentAnalysisParser.process_figure( - doc_for_pymupdf, figures_on_page[object_idx], media_describer + image_on_page = await DocumentAnalysisParser.figure_to_image( + doc_for_pymupdf, figures_on_page[object_idx] ) page_images.append(image_on_page) - page_text += image_on_page.description + page_text += image_on_page.placeholder added_objects.add(mask_char) + # We remove these comments since they are not 
needed and skew the page numbers page_text = page_text.replace("", "") # We remove excess newlines at the beginning and end of the page page_text = page_text.strip() - yield Page(page_num=page.page_number - 1, offset=offset, text=page_text, images=page_images) + yield Page( + page_num=page.page_number - 1, + offset=offset, + text=page_text, + images=page_images, + tables=page_tables, + ) offset += len(page_text) @staticmethod - async def process_figure( - doc: pymupdf.Document, figure: DocumentFigure, media_describer: MediaDescriber - ) -> ImageOnPage: + async def figure_to_image(doc: pymupdf.Document, figure: DocumentFigure) -> ImageOnPage: figure_title = (figure.caption and figure.caption.content) or "" # Generate a random UUID if figure.id is None figure_id = figure.id or f"fig_{uuid.uuid4().hex[:8]}" figure_filename = f"figure{figure_id.replace('.', '_')}.png" - logger.info( - "Describing figure %s with title '%s' using %s", figure_id, figure_title, type(media_describer).__name__ - ) + logger.info("Cropping figure %s with title '%s'", figure_id, figure_title) + placeholder = f'
' if not figure.bounding_regions: return ImageOnPage( bytes=b"", - page_num=0, # O-indexed + page_num=0, # 0-indexed figure_id=figure_id, bbox=(0, 0, 0, 0), filename=figure_filename, - description=f"
{figure_id} {figure_title}
", + title=figure_title, + placeholder=placeholder, + mime_type="image/png", ) if len(figure.bounding_regions) > 1: logger.warning("Figure %s has more than one bounding region, using the first one", figure_id) @@ -253,14 +212,15 @@ async def process_figure( ) page_number = first_region["pageNumber"] # 1-indexed cropped_img, bbox_pixels = DocumentAnalysisParser.crop_image_from_pdf_page(doc, page_number - 1, bounding_box) - figure_description = await media_describer.describe_image(cropped_img) return ImageOnPage( bytes=cropped_img, page_num=page_number - 1, # Convert to 0-indexed figure_id=figure_id, bbox=bbox_pixels, filename=figure_filename, - description=f"
{figure_id} {figure_title}
{figure_description}
", + title=figure_title, + placeholder=placeholder, + mime_type="image/png", ) @staticmethod diff --git a/app/backend/prepdocslib/servicesetup.py b/app/backend/prepdocslib/servicesetup.py new file mode 100644 index 0000000000..dbdbe7aaea --- /dev/null +++ b/app/backend/prepdocslib/servicesetup.py @@ -0,0 +1,202 @@ +"""Shared service setup helpers for OpenAI and multimodal image embeddings. + +This module centralizes logic that was previously duplicated between the local +ingestion script (`prepdocs.py`) and Azure Functions (e.g. `figure_processor`). + +Functions exported: + - setup_openai_client: Create an Async OpenAI / Azure OpenAI client using + either key auth or passwordless (Managed Identity / Developer CLI). + - setup_image_embeddings_service: Create an ImageEmbeddings helper when + multimodal features are enabled. + +The goal is to keep these concerns DRY so that credential / endpoint handling +stays consistent across ingestion pathways. +""" + +from __future__ import annotations + +import logging +import os +from enum import Enum + +from azure.core.credentials_async import AsyncTokenCredential +from azure.identity.aio import get_bearer_token_provider +from openai import AsyncAzureOpenAI, AsyncOpenAI + +from .blobmanager import BlobManager +from .embeddings import ImageEmbeddings +from .figureprocessor import FigureProcessor, MediaDescriptionStrategy + +logger = logging.getLogger("scripts") + + +class OpenAIHost(str, Enum): + """Supported OpenAI hosting styles. + + OPENAI: Public OpenAI API. + AZURE: Standard Azure OpenAI (service name becomes endpoint). + AZURE_CUSTOM: A fully custom endpoint URL (for Network Isolation / APIM). + LOCAL: A locally hosted OpenAI-compatible endpoint (no key required). 
+ """ + + OPENAI = "openai" + AZURE = "azure" + AZURE_CUSTOM = "azure_custom" + LOCAL = "local" + + +def setup_openai_client( + *, + openai_host: OpenAIHost, + azure_credential: AsyncTokenCredential, + azure_openai_api_key: str | None = None, + azure_openai_api_version: str | None = None, + azure_openai_service: str | None = None, + azure_openai_custom_url: str | None = None, + azure_openai_chat_deployment: str | None = None, + openai_api_key: str | None = None, + openai_organization: str | None = None, +) -> AsyncOpenAI: + """Create an Async OpenAI client for either Azure-hosted or public OpenAI API. + + For Azure, passwordless auth (Managed Identity / Developer CLI) is used + unless an override API key is provided. A chat deployment name can be + passed for convenience when downstream code requires it. + """ + + if openai_host not in OpenAIHost: + raise ValueError(f"Invalid OPENAI_HOST value: {openai_host}.") + + if openai_host in (OpenAIHost.AZURE, OpenAIHost.AZURE_CUSTOM): + if openai_host == OpenAIHost.AZURE_CUSTOM: + if not azure_openai_custom_url: + raise ValueError("AZURE_OPENAI_CUSTOM_URL must be set for azure_custom host") + endpoint = azure_openai_custom_url + else: + if not azure_openai_service: + raise ValueError("AZURE_OPENAI_SERVICE must be set for azure host") + endpoint = f"https://{azure_openai_service}.openai.azure.com" + + if azure_openai_api_key: + logger.info("Using Azure OpenAI key override for client auth") + return AsyncAzureOpenAI( + api_version=azure_openai_api_version, + azure_endpoint=endpoint, + api_key=azure_openai_api_key, + azure_deployment=azure_openai_chat_deployment, + ) + logger.info("Using passwordless auth (token provider) for Azure OpenAI client") + token_provider = get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default") + return AsyncAzureOpenAI( + api_version=azure_openai_api_version, + azure_endpoint=endpoint, + azure_ad_token_provider=token_provider, + 
azure_deployment=azure_openai_chat_deployment, + ) + + if openai_host == OpenAIHost.LOCAL: + base_url = os.environ.get("OPENAI_BASE_URL") + if not base_url: + raise ValueError("OPENAI_BASE_URL must be set when OPENAI_HOST=local") + logger.info("Using local OpenAI-compatible endpoint: %s", base_url) + return AsyncOpenAI(base_url=base_url, api_key="no-key-required") + + # Public OpenAI API + if openai_api_key is None: + raise ValueError("OPENAI_API_KEY is required for public OpenAI host") + logger.info("Using public OpenAI host with key authentication") + return AsyncOpenAI(api_key=openai_api_key, organization=openai_organization) + + +def setup_image_embeddings_service( + *, + azure_credential: AsyncTokenCredential, + vision_endpoint: str | None, + use_multimodal: bool, +) -> ImageEmbeddings | None: + """Create an ImageEmbeddings helper if multimodal features are enabled. + + Returns None when multimodal is disabled so calling code can skip image + embedding generation gracefully. + """ + + if not use_multimodal: + logger.info("Multimodal disabled; not creating image embeddings service") + return None + if vision_endpoint is None: + raise ValueError("An Azure AI Vision endpoint must be provided for multimodal features") + + token_provider = get_bearer_token_provider(azure_credential, "https://cognitiveservices.azure.com/.default") + logger.info("Creating ImageEmbeddings service for endpoint %s", vision_endpoint) + return ImageEmbeddings(endpoint=vision_endpoint, token_provider=token_provider) + + +__all__ = [ + "OpenAIHost", + "setup_openai_client", + "setup_image_embeddings_service", + "setup_blob_manager", + "setup_figure_processor", +] + + +def setup_blob_manager( + *, + storage_account: str, + storage_container: str, + credential: AsyncTokenCredential | str, + storage_resource_group: str | None = None, + subscription_id: str | None = None, + image_storage_container: str | None = None, +) -> BlobManager: + """Create a BlobManager instance for document or figure 
storage. + + The optional resource group and subscription are retained for parity with + local ingestion (used for diagnostic operations) but not required by + Azure Functions. Image container may differ from the main document + container when figures are stored separately. + """ + endpoint = f"https://{storage_account}.blob.core.windows.net" + return BlobManager( + endpoint=endpoint, + container=storage_container, + account=storage_account, + credential=credential, + resource_group=storage_resource_group, + subscription_id=subscription_id, + image_container=image_storage_container, + ) + + +def setup_figure_processor( + *, + credential: AsyncTokenCredential | None, + use_multimodal: bool, + use_content_understanding: bool, + content_understanding_endpoint: str | None, + openai_client: object | None, + openai_model: str | None, + openai_deployment: str | None, +) -> FigureProcessor | None: + """Create a FigureProcessor based on feature flags. + + Priority order: + 1. use_multimodal -> MediaDescriptionStrategy.OPENAI + 2. else if use_content_understanding and endpoint -> CONTENTUNDERSTANDING + 3. 
else -> return None (no figure description) + """ + if use_multimodal: + return FigureProcessor( + credential=credential, + strategy=MediaDescriptionStrategy.OPENAI, + openai_client=openai_client, + openai_model=openai_model, + openai_deployment=openai_deployment, + ) + if use_content_understanding and content_understanding_endpoint: + return FigureProcessor( + credential=credential, + strategy=MediaDescriptionStrategy.CONTENTUNDERSTANDING, + content_understanding_endpoint=content_understanding_endpoint, + ) + return None diff --git a/app/backend/prepdocslib/textprocessor.py b/app/backend/prepdocslib/textprocessor.py new file mode 100644 index 0000000000..595951a0fb --- /dev/null +++ b/app/backend/prepdocslib/textprocessor.py @@ -0,0 +1,70 @@ +"""Utilities for processing document text and combining it with figure descriptions.""" + +import logging +from typing import TYPE_CHECKING + +if TYPE_CHECKING: # pragma: no cover - used only for type hints + from .listfilestrategy import File + from .page import Page + from .searchmanager import Section + from .textsplitter import TextSplitter + +logger = logging.getLogger("scripts") + + +def combine_text_with_figures(page: "Page") -> None: + """Replace figure placeholders in page text with full description markup. + + This is Skill #3 (text_processor) in the three-skill pipeline. + After figures have been described and enriched, this replaces their + placeholders in the page text with the full
markup. + """ + + for image in page.images: + if image.description and image.placeholder in page.text: + page.text = page.text.replace(image.placeholder, image.description) + logger.info("Replaced placeholder for figure %s with description markup", image.figure_id) + elif not image.description: + logger.debug("No description for figure %s; keeping placeholder", image.figure_id) + elif image.placeholder not in page.text: + logger.warning("Placeholder not found for figure %s in page %d", image.figure_id, page.page_num) + + +def process_text( + pages: list["Page"], + file: "File", + splitter: "TextSplitter", + category: str | None = None, +) -> list["Section"]: + """Process document text and figures into searchable sections. + + This is Skill #3 (text_processor) in the three-skill pipeline. + Combines text with figure descriptions, splits into chunks, and + associates figures with their containing sections. + + Args: + pages: List of parsed pages with enriched figures + file: Original file being processed + splitter: Text splitter for chunking content + category: Optional category for sections + + Returns: + List of Sections ready for indexing + """ + from .searchmanager import Section + + # Step 1: Combine text with figures on each page + for page in pages: + combine_text_with_figures(page) + + # Step 2: Split combined text into chunks + logger.info("Splitting '%s' into sections", file.filename()) + sections = [Section(chunk, content=file, category=category) for chunk in splitter.split_pages(pages)] + + # Step 3: Add images back to each section based on page number + for section in sections: + section.chunk.images = [ + image for page in pages if page.page_num == section.chunk.page_num for image in page.images + ] + + return sections diff --git a/app/functions/document_extractor/.funcignore b/app/functions/document_extractor/.funcignore new file mode 100644 index 0000000000..5470b4c51a --- /dev/null +++ b/app/functions/document_extractor/.funcignore @@ -0,0 +1,11 @@ 
+.git* +.vscode +__pycache__ +*.pyc +.python_packages +.venv +local.settings.json +test +.pytest_cache +.coverage +*.md diff --git a/app/functions/document_extractor/function_app.py b/app/functions/document_extractor/function_app.py new file mode 100644 index 0000000000..6b89a309c0 --- /dev/null +++ b/app/functions/document_extractor/function_app.py @@ -0,0 +1,210 @@ +""" +Azure Function: Document Extractor +Custom skill for Azure AI Search that extracts and processes document content. +""" + +import base64 +import io +import json +import logging +import os +from typing import Any + +import azure.functions as func +from azure.core.exceptions import HttpResponseError +from azure.identity.aio import ManagedIdentityCredential + +from prepdocslib.ingestionhelpers import select_parser +from prepdocslib.page import Page + +app = func.FunctionApp() + +logger = logging.getLogger(__name__) + + +USE_LOCAL_PDF_PARSER = os.getenv("USE_LOCAL_PDF_PARSER", "false").lower() == "true" +USE_LOCAL_HTML_PARSER = os.getenv("USE_LOCAL_HTML_PARSER", "false").lower() == "true" +USE_MULTIMODAL = os.getenv("USE_MULTIMODAL", "false").lower() == "true" + +DOCUMENT_INTELLIGENCE_SERVICE = os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE") + +# Eagerly create a single managed identity credential instance for the worker. 
+if AZURE_CLIENT_ID := os.getenv("AZURE_CLIENT_ID"): + logger.info("Using Managed Identity with client ID: %s", AZURE_CLIENT_ID) + AZURE_CREDENTIAL = ManagedIdentityCredential(client_id=AZURE_CLIENT_ID) +else: + logger.info("Using default Managed Identity without client ID") + AZURE_CREDENTIAL = ManagedIdentityCredential() + + +@app.function_name(name="extract") +@app.route(route="extract", methods=["POST"]) +async def extract_document(req: func.HttpRequest) -> func.HttpResponse: + """ + Azure Search Custom Skill: Extract document content + + Input format (single record; file data only): + # https://learn.microsoft.com/en-us/azure/search/cognitive-search-skill-document-intelligence-layout#skill-inputs + { + "values": [ + { + "recordId": "1", + "data": { + // Base64 encoded file (skillset must enable file data) + "file_data": { + "$type": "file", + "data": "base64..." + }, + // Optional + "file_name": "doc.pdf" + } + } + ] + } + + Output format (snake_case only): + { + "values": [ + { + "recordId": "1", + "data": { + "pages": [ + {"page_num": 0, "text": "Page 1 text", "figure_ids": ["fig1"]}, + {"page_num": 1, "text": "Page 2 text", "figure_ids": []} + ], + "figures": [ + { + "figure_id": "fig1", + "page_num": 0, + "document_file_name": "doc.pdf", + "filename": "fig1.png", + "mime_type": "image/png", + "bytes_base64": "...", + "bbox": [100,150,300,400], + "title": "Figure Title", + "placeholder": "
" + } + ] + }, + "errors": [], + "warnings": [] + } + ] + } + """ + try: + # Parse custom skill input + req_body = req.get_json() + input_values = req_body.get("values", []) + + if len(input_values) != 1: + raise ValueError("document_extractor expects exactly one record per request, set batchSize to 1.") + + input_record = input_values[0] + record_id = input_record.get("recordId", "") + data = input_record.get("data", {}) + + try: + result = await process_document(data) + output_values = [ + { + "recordId": record_id, + "data": result, + "errors": [], + "warnings": [], + } + ] + except Exception as e: + logger.error(f"Error processing record {record_id}: {str(e)}", exc_info=True) + output_values = [ + { + "recordId": record_id, + "data": {}, + "errors": [{"message": str(e)}], + "warnings": [], + } + ] + + return func.HttpResponse(json.dumps({"values": output_values}), mimetype="application/json", status_code=200) + + except Exception as e: + logger.error(f"Fatal error in extract_document: {str(e)}", exc_info=True) + return func.HttpResponse(json.dumps({"error": str(e)}), mimetype="application/json", status_code=500) + + +async def process_document(data: dict[str, Any]) -> dict[str, Any]: + """ + Process a single document: download, parse, extract figures, upload images + + Args: + data: Input data with blobUrl, fileName, contentType + + Returns: + Dictionary with 'text' (markdown) and 'images' (list of {url, description}) + """ + document_stream, file_name, content_type = get_document_stream_filedata(data) + logger.info("Processing document: %s", file_name) + + parser = select_parser( + file_name=file_name, + content_type=content_type, + azure_credential=AZURE_CREDENTIAL, + document_intelligence_service=DOCUMENT_INTELLIGENCE_SERVICE or None, + document_intelligence_key=None, + process_figures=USE_MULTIMODAL, + use_local_pdf_parser=USE_LOCAL_PDF_PARSER, + use_local_html_parser=USE_LOCAL_HTML_PARSER, + ) + + pages: list[Page] = [] + try: + document_stream.seek(0) + 
pages = [page async for page in parser.parse(content=document_stream)] + except HttpResponseError as exc: + raise ValueError(f"Parser failed for {file_name}: {exc.message}") from exc + finally: + document_stream.close() + + components = build_document_components(file_name, pages) + return components + + +def get_document_stream_filedata(data: dict[str, Any]) -> tuple[io.BytesIO, str, str]: + """Return a BytesIO stream for file_data input only (skillset must send file bytes).""" + file_payload = data.get("file_data", {}) + encoded = file_payload.get("data") + if not encoded: + raise ValueError("file_data payload missing base64 data") + document_bytes = base64.b64decode(encoded) + file_name = data.get("file_name") or data.get("fileName") or file_payload.get("name") or "document" + content_type = data.get("contentType") or file_payload.get("contentType") or "application/octet-stream" + stream = io.BytesIO(document_bytes) + stream.name = file_name + return stream, file_name, content_type + + +def build_document_components(file_name: str, pages: list[Page]) -> dict[str, Any]: + page_entries: list[dict[str, Any]] = [] + figure_entries: list[dict[str, Any]] = [] + + for page in pages: + page_text = page.text or "" + figure_ids_on_page: list[str] = [] + if page.images: + for image in page.images: + figure_ids_on_page.append(image.figure_id) + figure_entries.append(image.to_skill_payload(file_name)) + + page_entries.append( + { + "page_num": page.page_num, + "text": page_text, + "figure_ids": figure_ids_on_page, + } + ) + + return { + "file_name": file_name, + "pages": page_entries, + "figures": figure_entries, + } diff --git a/app/functions/document_extractor/host.json b/app/functions/document_extractor/host.json new file mode 100644 index 0000000000..20f502bb31 --- /dev/null +++ b/app/functions/document_extractor/host.json @@ -0,0 +1,20 @@ +{ + "version": "2.0", + "extensionBundle": { + "id": "Microsoft.Azure.Functions.ExtensionBundle", + "version": "[4.*, 5.0.0)" + }, + 
"functionTimeout": "00:10:00", + "logging": { + "logLevel": { + "default": "Information", + "Function": "Information" + }, + "applicationInsights": { + "samplingSettings": { + "isEnabled": true, + "maxTelemetryItemsPerSecond": 20 + } + } + } +} diff --git a/app/functions/document_extractor/requirements.txt b/app/functions/document_extractor/requirements.txt new file mode 100644 index 0000000000..fd693a5f09 --- /dev/null +++ b/app/functions/document_extractor/requirements.txt @@ -0,0 +1,9 @@ +# Azure Functions runtime +azure-functions>=1.21.3,<2.0.0 + +# Local dependency: prepdocslib +../../backend/prepdocslib + +# Note: prepdocslib's dependencies will be pulled in automatically +# including: azure-ai-documentintelligence, azure-storage-blob, +# azure-identity, openai, pymupdf, beautifulsoup4, etc. diff --git a/app/functions/figure_processor/.funcignore b/app/functions/figure_processor/.funcignore new file mode 100644 index 0000000000..5470b4c51a --- /dev/null +++ b/app/functions/figure_processor/.funcignore @@ -0,0 +1,11 @@ +.git* +.vscode +__pycache__ +*.pyc +.python_packages +.venv +local.settings.json +test +.pytest_cache +.coverage +*.md diff --git a/app/functions/figure_processor/function_app.py b/app/functions/figure_processor/function_app.py new file mode 100644 index 0000000000..a883220110 --- /dev/null +++ b/app/functions/figure_processor/function_app.py @@ -0,0 +1,174 @@ +""" +Azure Function: Figure Processor +Custom skill for Azure AI Search that enriches figure payloads emitted by the document extractor. + +This function: +1. Accepts raw figure bytes and metadata (one record per request due to skill fanout). +2. Uploads rendered figure images to blob storage with citation overlays. +3. Generates natural-language captions via Azure OpenAI or Content Understanding (when configured). +4. Optionally computes image embeddings using Azure AI Vision (when multimodal is enabled). +5. 
Returns enriched figure metadata back to the indexer for downstream text processing. +""" + +from __future__ import annotations + +import json +import logging +import os +from typing import Any + +import azure.functions as func +from azure.identity.aio import ManagedIdentityCredential, get_bearer_token_provider + +from prepdocslib.blobmanager import BlobManager +from prepdocslib.embeddings import ImageEmbeddings +from prepdocslib.figureprocessor import FigureProcessor, process_page_image +from prepdocslib.page import ImageOnPage +from prepdocslib.servicesetup import ( + OpenAIHost, + setup_blob_manager, + setup_figure_processor, + setup_openai_client, +) + +app = func.FunctionApp() + +logger = logging.getLogger(__name__) + +# Environment configuration +AZURE_STORAGE_ACCOUNT = os.getenv("AZURE_STORAGE_ACCOUNT", "") +IMAGE_CONTAINER = os.getenv("AZURE_IMAGESTORAGE_CONTAINER") or os.getenv("AZURE_STORAGE_CONTAINER", "") +USE_MULTIMODAL = os.getenv("USE_MULTIMODAL", "false").lower() == "true" +USE_MEDIA_DESCRIBER_AZURE_CU = os.getenv("USE_MEDIA_DESCRIBER_AZURE_CU", "false").lower() == "true" +CONTENT_UNDERSTANDING_ENDPOINT = os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT", "") +AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE", "") +AZURE_OPENAI_CUSTOM_URL = os.getenv("AZURE_OPENAI_CUSTOM_URL", "") +AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "") +AZURE_OPENAI_CHATGPT_DEPLOYMENT = os.getenv("AZURE_OPENAI_CHATGPT_DEPLOYMENT", "") +AZURE_OPENAI_CHATGPT_MODEL = os.getenv("AZURE_OPENAI_CHATGPT_MODEL", "") +AZURE_VISION_ENDPOINT = os.getenv("AZURE_VISION_ENDPOINT", "") + +BLOB_MANAGER: BlobManager | None +FIGURE_PROCESSOR: FigureProcessor | None +IMAGE_EMBEDDINGS: ImageEmbeddings | None + +# Single shared managed identity credential (matches document_extractor pattern) +if AZURE_CLIENT_ID := os.getenv("AZURE_CLIENT_ID"): + logger.info("Using Managed Identity with client ID: %s", AZURE_CLIENT_ID) + GLOBAL_CREDENTIAL = 
ManagedIdentityCredential(client_id=AZURE_CLIENT_ID) +else: + logger.info("Using default Managed Identity without client ID") + GLOBAL_CREDENTIAL = ManagedIdentityCredential() + + +# Direct eager initialization (no helper functions) +# Blob Manager +if AZURE_STORAGE_ACCOUNT and IMAGE_CONTAINER: + BLOB_MANAGER = setup_blob_manager( + storage_account=AZURE_STORAGE_ACCOUNT, + storage_container=IMAGE_CONTAINER, + credential=GLOBAL_CREDENTIAL, + image_storage_container=IMAGE_CONTAINER, + ) +else: + logger.warning("Blob manager not initialized due to missing storage configuration") + BLOB_MANAGER = None + +# Figure Processor +_openai_client = None +_openai_model = None +_openai_deployment = None +openai_ready = ( + USE_MULTIMODAL + and AZURE_OPENAI_API_VERSION + and (AZURE_OPENAI_SERVICE or AZURE_OPENAI_CUSTOM_URL) + and AZURE_OPENAI_CHATGPT_DEPLOYMENT +) +if openai_ready: + _host = OpenAIHost.AZURE_CUSTOM if AZURE_OPENAI_CUSTOM_URL else OpenAIHost.AZURE + _openai_client = setup_openai_client( + openai_host=_host, + azure_credential=GLOBAL_CREDENTIAL, + azure_openai_api_version=AZURE_OPENAI_API_VERSION, + azure_openai_service=AZURE_OPENAI_SERVICE or None, + azure_openai_custom_url=AZURE_OPENAI_CUSTOM_URL or None, + azure_openai_chat_deployment=AZURE_OPENAI_CHATGPT_DEPLOYMENT, + ) + _openai_model = AZURE_OPENAI_CHATGPT_MODEL or AZURE_OPENAI_CHATGPT_DEPLOYMENT + _openai_deployment = AZURE_OPENAI_CHATGPT_DEPLOYMENT +elif USE_MULTIMODAL: + logger.warning("USE_MULTIMODAL is true but Azure OpenAI configuration incomplete; disabling OPENAI strategy") + +FIGURE_PROCESSOR = setup_figure_processor( + credential=GLOBAL_CREDENTIAL, + use_multimodal=bool(openai_ready), + use_content_understanding=USE_MEDIA_DESCRIBER_AZURE_CU, + content_understanding_endpoint=CONTENT_UNDERSTANDING_ENDPOINT or None, + openai_client=_openai_client, + openai_model=_openai_model, + openai_deployment=_openai_deployment, +) + +# Image Embeddings +if USE_MULTIMODAL and AZURE_VISION_ENDPOINT: + 
_token_provider = get_bearer_token_provider(GLOBAL_CREDENTIAL, "https://cognitiveservices.azure.com/.default") + IMAGE_EMBEDDINGS = ImageEmbeddings(AZURE_VISION_ENDPOINT, _token_provider) +else: + IMAGE_EMBEDDINGS = None + + +@app.function_name(name="process_figure") +@app.route(route="process", methods=["POST"]) +async def process_figure_request(req: func.HttpRequest) -> func.HttpResponse: + """Entrypoint for Azure Search custom skill calls.""" + + try: + payload = req.get_json() + except ValueError as exc: + logger.error("Failed to parse request body: %s", exc) + return func.HttpResponse( + json.dumps({"error": "Invalid JSON payload"}), + mimetype="application/json", + status_code=400, + ) + + input_values = payload.get("values", []) + output_values: list[dict[str, Any]] = [] + + for record in input_values: + record_id = record.get("recordId", "") + data = record.get("data", {}) + try: + image_on_page, file_name = ImageOnPage.from_skill_payload(data) + await process_page_image( + image=image_on_page, + document_filename=file_name, + blob_manager=BLOB_MANAGER, + image_embeddings_client=IMAGE_EMBEDDINGS, + figure_processor=FIGURE_PROCESSOR, + ) + figure_payload = image_on_page.to_skill_payload(file_name, include_bytes_base64=False, include_bytes=False) + output_values.append( + { + "recordId": record_id, + "data": figure_payload, + "errors": [], + "warnings": [], + } + ) + except Exception as exc: # pragma: no cover - defensive + logger.error("Error processing figure %s: %s", record_id, exc, exc_info=True) + output_values.append( + { + "recordId": record_id, + "data": {}, + "errors": [{"message": str(exc)}], + "warnings": [], + } + ) + + return func.HttpResponse( + json.dumps({"values": output_values}), + mimetype="application/json", + status_code=200, + ) diff --git a/app/functions/figure_processor/host.json b/app/functions/figure_processor/host.json new file mode 100644 index 0000000000..ae5ca6fb09 --- /dev/null +++ b/app/functions/figure_processor/host.json @@ 
-0,0 +1,20 @@ +{ + "version": "2.0", + "extensionBundle": { + "id": "Microsoft.Azure.Functions.ExtensionBundle", + "version": "[4.*, 5.0.0)" + }, + "functionTimeout": "00:06:00", + "logging": { + "logLevel": { + "default": "Information", + "Function": "Information" + }, + "applicationInsights": { + "samplingSettings": { + "isEnabled": true, + "maxTelemetryItemsPerSecond": 20 + } + } + } +} diff --git a/app/functions/figure_processor/requirements.txt b/app/functions/figure_processor/requirements.txt new file mode 100644 index 0000000000..ca9be31e1c --- /dev/null +++ b/app/functions/figure_processor/requirements.txt @@ -0,0 +1,7 @@ +# Azure Functions runtime +azure-functions>=1.21.3,<2.0.0 + +# Local dependency: prepdocslib +../../backend/prepdocslib + +# prepdocslib brings in azure-identity, azure-storage-blob, openai, etc. diff --git a/app/functions/text_processor/.funcignore b/app/functions/text_processor/.funcignore new file mode 100644 index 0000000000..5470b4c51a --- /dev/null +++ b/app/functions/text_processor/.funcignore @@ -0,0 +1,11 @@ +.git* +.vscode +__pycache__ +*.pyc +.python_packages +.venv +local.settings.json +test +.pytest_cache +.coverage +*.md diff --git a/app/functions/text_processor/function_app.py b/app/functions/text_processor/function_app.py new file mode 100644 index 0000000000..6510c60940 --- /dev/null +++ b/app/functions/text_processor/function_app.py @@ -0,0 +1,231 @@ +"""Azure Function: Text Processor. + +Processes markdown text into search chunks with (optional) embeddings and figure metadata. 
+""" + +from __future__ import annotations + +import io +import json +import logging +import os +from typing import Any + +import azure.functions as func +from azure.identity.aio import ManagedIdentityCredential + +from prepdocslib.blobmanager import BlobManager +from prepdocslib.embeddings import AzureOpenAIEmbeddingService +from prepdocslib.listfilestrategy import File +from prepdocslib.page import ImageOnPage, Page +from prepdocslib.textprocessor import process_text +from prepdocslib.textsplitter import SentenceTextSplitter + +app = func.FunctionApp() + +logger = logging.getLogger(__name__) + +USE_VECTORS = os.getenv("USE_VECTORS", "true").lower() == "true" +USE_MULTIMODAL = os.getenv("USE_MULTIMODAL", "false").lower() == "true" + +AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE", "") +AZURE_OPENAI_CUSTOM_URL = os.getenv("AZURE_OPENAI_CUSTOM_URL", "") +AZURE_OPENAI_EMB_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT", "") +AZURE_OPENAI_EMB_MODEL_NAME = os.getenv("AZURE_OPENAI_EMB_MODEL_NAME", "text-embedding-3-large") +AZURE_OPENAI_EMB_DIMENSIONS = int(os.getenv("AZURE_OPENAI_EMB_DIMENSIONS", "3072")) +AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "") + +GLOBAL_CREDENTIAL: ManagedIdentityCredential | None +EMBEDDING_SERVICE: AzureOpenAIEmbeddingService | None +SENTENCE_SPLITTER = SentenceTextSplitter() + +# --------------------------------------------------------------------------- +# Global credential initialisation (single shared Managed Identity credential) +# --------------------------------------------------------------------------- +if AZURE_CLIENT_ID := (os.getenv("AZURE_CLIENT_ID") or os.getenv("IDENTITY_CLIENT_ID") or os.getenv("MSI_CLIENT_ID")): + logger.info("Using Managed Identity with client ID: %s", AZURE_CLIENT_ID) + GLOBAL_CREDENTIAL = ManagedIdentityCredential(client_id=AZURE_CLIENT_ID) +else: + logger.info("Using default Managed Identity without explicit client ID") + GLOBAL_CREDENTIAL = ManagedIdentityCredential() 
+ +# --------------------------------------------------------------------------- +# Embedding service initialisation (optional) +# --------------------------------------------------------------------------- +EMBEDDING_SERVICE = None +if USE_VECTORS: + embeddings_ready = ( + AZURE_OPENAI_API_VERSION + and (AZURE_OPENAI_SERVICE or AZURE_OPENAI_CUSTOM_URL) + and (AZURE_OPENAI_EMB_DEPLOYMENT or AZURE_OPENAI_EMB_MODEL_NAME) + ) + if embeddings_ready: + try: + EMBEDDING_SERVICE = AzureOpenAIEmbeddingService( + open_ai_service=AZURE_OPENAI_SERVICE or None, + open_ai_deployment=AZURE_OPENAI_EMB_DEPLOYMENT or None, + open_ai_model_name=AZURE_OPENAI_EMB_MODEL_NAME, + open_ai_dimensions=AZURE_OPENAI_EMB_DIMENSIONS, + open_ai_api_version=AZURE_OPENAI_API_VERSION, + credential=GLOBAL_CREDENTIAL, + open_ai_custom_url=AZURE_OPENAI_CUSTOM_URL or None, + ) + logger.info( + "Embedding service initialised (deployment=%s, model=%s, dims=%d)", + AZURE_OPENAI_EMB_DEPLOYMENT or AZURE_OPENAI_EMB_MODEL_NAME, + AZURE_OPENAI_EMB_MODEL_NAME, + AZURE_OPENAI_EMB_DIMENSIONS, + ) + except Exception as exc: # pragma: no cover - defensive initialisation + logger.error("Failed to initialise embedding service: %s", exc, exc_info=True) + EMBEDDING_SERVICE = None + else: + logger.warning("USE_VECTORS is true but embedding configuration incomplete; embeddings disabled") + + +@app.function_name(name="process_text") +@app.route(route="process", methods=["POST"]) +async def process_text_entry(req: func.HttpRequest) -> func.HttpResponse: + """Azure Search custom skill entry point for chunking and embeddings.""" + + try: + payload = req.get_json() + except ValueError as exc: + logger.error("Invalid JSON payload: %s", exc) + return func.HttpResponse( + json.dumps({"error": "Request body must be valid JSON"}), + mimetype="application/json", + status_code=400, + ) + + values = payload.get("values", []) + output_values: list[dict[str, Any]] = [] + + for record in values: + record_id = record.get("recordId", "") 
+ data = record.get("data", {}) + try: + chunks = await _process_document(data) + output_values.append( + { + "recordId": record_id, + "data": {"chunks": chunks}, + "errors": [], + "warnings": [], + } + ) + except Exception as exc: # pragma: no cover - defensive logging + logger.error("Failed to process record %s: %s", record_id, exc, exc_info=True) + output_values.append( + { + "recordId": record_id, + "data": {}, + "errors": [{"message": str(exc)}], + "warnings": [], + } + ) + + return func.HttpResponse( + json.dumps({"values": output_values}), + mimetype="application/json", + status_code=200, + ) + + +async def _process_document(data: dict[str, Any]) -> list[dict[str, Any]]: + """Combine figures with page text, split into chunks, and (optionally) embed. + + Parameters + ---------- + data: dict[str, Any] + Skill payload containing file metadata, pages, and figures. + + Returns + ------- + list[dict[str, Any]] + Chunk dictionaries ready for downstream indexing. + """ + + file_name = data.get("file_name", "document") + storage_url = data.get("storageUrl") or data.get("metadata_storage_path") or file_name + pages_input = data.get("pages", []) # [{page_num, text, figure_ids}] + figures_input = data.get("figures", []) # serialized skill payload + figures_by_id = {figure["figure_id"]: figure for figure in figures_input} + + # Build Page objects with placeholders intact (figure markup will be injected by combine_text_with_figures()) + pages: list[Page] = [] + offset = 0 + for page_entry in pages_input: + # Zero-based page numbering: pages emitted by extractor already zero-based + page_num = int(page_entry.get("page_num", len(pages))) + page_text = page_entry.get("text", "") + page_obj = Page(page_num=page_num, offset=offset, text=page_text) + offset += len(page_text) + + # Construct ImageOnPage objects from figureIds list + figure_ids: list[str] = page_entry.get("figure_ids", []) + for fid in figure_ids: + figure_payload = figures_by_id.get(fid) + if not figure_payload: 
+ logger.warning("Figure ID %s not found in figures metadata for page %d", fid, page_num) + continue + image_on_page = ImageOnPage.from_skill_payload(figure_payload) + page_obj.images.append(image_on_page) + pages.append(page_obj) + + if not pages: + logger.info("No textual content found for %s", file_name) + return [] + + # Create a lightweight File wrapper required by process_text + dummy_stream = io.BytesIO(b"") + dummy_stream.name = file_name + file_wrapper = File(content=dummy_stream) + + sections = process_text(pages, file_wrapper, SENTENCE_SPLITTER, category=None) + if not sections: + return [] + + # Generate embeddings for section texts + chunk_texts = [s.chunk.text for s in sections] + embeddings: list[list[float]] | None = None + if USE_VECTORS and chunk_texts: + if EMBEDDING_SERVICE: + embeddings = await EMBEDDING_SERVICE.create_embeddings(chunk_texts) + else: + logger.warning("Embeddings requested but service not initialised; skipping vectors") + + # Use the same id base generation as local ingestion pipeline for parity + normalized_id = file_wrapper.filename_to_id() + outputs: list[dict[str, Any]] = [] + for idx, section in enumerate(sections): + content = section.chunk.text.strip() + if not content: + continue + embedding_vec = embeddings[idx] if embeddings else [] + image_refs: list[dict[str, Any]] = [] + for image in section.chunk.images: + ref = { + "id": image.figure_id, + "url": image.url or "", + "caption": image.title or image.figure_id, + "bbox": list(image.bbox), + } + # Optionally surface plain description separately (strip markup) if needed later. + # Since image.description now holds markup, we do not include it here by default. 
+ if USE_MULTIMODAL and image.embedding is not None: + ref["imageEmbedding"] = image.embedding + image_refs.append(ref) + outputs.append( + { + "id": f"{normalized_id}-{idx:04d}", + "content": content, + "embedding": embedding_vec, + "sourcepage": BlobManager.sourcepage_from_file_page(file_name, section.chunk.page_num), + "sourcefile": file_name, + "parent_id": storage_url, + **({"images": image_refs} if image_refs else {}), + } + ) + + return outputs diff --git a/app/functions/text_processor/host.json b/app/functions/text_processor/host.json new file mode 100644 index 0000000000..656342971e --- /dev/null +++ b/app/functions/text_processor/host.json @@ -0,0 +1,20 @@ +{ + "version": "2.0", + "extensionBundle": { + "id": "Microsoft.Azure.Functions.ExtensionBundle", + "version": "[4.*, 5.0.0)" + }, + "functionTimeout": "00:05:00", + "logging": { + "logLevel": { + "default": "Information", + "Function": "Information" + }, + "applicationInsights": { + "samplingSettings": { + "isEnabled": true, + "maxTelemetryItemsPerSecond": 20 + } + } + } +} diff --git a/app/functions/text_processor/requirements.txt b/app/functions/text_processor/requirements.txt new file mode 100644 index 0000000000..4806076325 --- /dev/null +++ b/app/functions/text_processor/requirements.txt @@ -0,0 +1,9 @@ +# Azure Functions runtime +azure-functions>=1.21.3,<2.0.0 + +# Local dependency: prepdocslib +../../backend/prepdocslib + +# Note: prepdocslib's dependencies will be pulled in automatically +# including: azure-search-documents, azure-storage-blob, +# azure-identity, openai, tiktoken, etc. 
diff --git a/azure.yaml b/azure.yaml index f629d9a374..7de8ec5c3a 100644 --- a/azure.yaml +++ b/azure.yaml @@ -40,6 +40,18 @@ services: run: cd ../frontend;npm install;npm run build interactive: false continueOnError: false + document-extractor: + project: ./app/functions/document_extractor + language: py + host: function + figure-processor: + project: ./app/functions/figure_processor + language: py + host: function + text-processor: + project: ./app/functions/text_processor + language: py + host: function hooks: preprovision: windows: diff --git a/docs/cloud_ingestion.md b/docs/cloud_ingestion.md new file mode 100644 index 0000000000..35720b4e80 --- /dev/null +++ b/docs/cloud_ingestion.md @@ -0,0 +1,775 @@ +# Cloud-Based Data Ingestion with Azure Functions + +This document describes the cloud-based ingestion architecture that uses Azure Functions as custom skills for Azure AI Search indexers. + +## Overview + +The cloud ingestion strategy provides an alternative to the local script-based ingestion (`scripts/prepdocs.sh`). Instead of processing documents locally and uploading them to Azure AI Search, the cloud approach uses: + +1. **Azure Blob Storage** as the document source +2. **Azure AI Search Indexer** as the orchestration engine +3. **Three Azure Functions** acting as chained custom skills for document processing + +This architecture enables serverless, scalable, and event-driven document processing. 
+ +## Architecture + +```ascii +┌─────────────────────────────────────────────────────────────────┐ +│ USER: Upload files to blob storage (content container) │ +└──────────────────────┬──────────────────────────────────────────┘ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Azure AI Search Indexer │ +│ - Blob data source (monitors content container) │ +│ - Skillset with 3 chained custom skills │ +│ - Runs on schedule or on-demand │ +│ - Handles retries, checkpointing, state tracking │ +└──────────────────────┬──────────────────────────────────────────┘ + │ + ▼ + ┌──────────────────────────────┐ + │ SKILL #1: document_extractor│ + │ (Flex Consumption Function) │ + │ HTTP Trigger │ + │ Timeout: 10 minutes │ + └─────────────┬────────────────┘ + │ + Input: │ Output: + • Blob URL │ • Markdown text with figure anchors + • File metadata │ • Page metadata (text + figure ids) + │ • Figures array (metadata + base64 image) + Processes: │ + • Download blob │ + • Document Intelligence + • Figure cropping (PyMuPDF) + • Table extraction + │ + ▼ + ┌─────────────────────────────┐ + │ SKILL #2: figure_processor │ + │ (Flex Consumption Function)│ + │ HTTP Trigger │ + │ Timeout: 6 minutes │ + │ Memory: 3072 MB │ + └─────────────┬───────────────┘ + │ + Context: │ Output: + • /document/figures/*│ • Figure url (blob SAS) + Input values: │ • Figure caption + • Figure bytes │ • Figure embedding vector + • Figure metadata │ + Processes: │ + • Upload to blob │ + • Describe via LLM │ + • Embed via Vision │ + │ + ▼ + ┌─────────────────────────────┐ + │ SKILL #3: text_processor │ + │ (Combines, splits, embeds) │ + │ HTTP Trigger │ + │ Timeout: 5 minutes │ + │ Memory: 2048 MB │ + └─────────────┬───────────────┘ + │ + Input: │ Output: + • Full markdown │ • Array of chunks with: + • Processed figures │ - Content text + • File metadata │ - Text embeddings + Processes: │ - Figure references + embeddings + • Enrich placeholders│ - Metadata (sourcepage, etc.) 
+ • Split text │ + • Generate embeddings│ + │ + ▼ +┌─────────────────────────────────────────────────────────────────┐ +│ Azure AI Search Index │ +│ Indexer writes enriched documents with embeddings │ +└─────────────────────────────────────────────────────────────────┘ +``` + +## Components + +### 1. Document Extractor Function + +**Location:** `app/functions/document_extractor/` + +**Purpose:** First stage of processing—extracts structured content and raw figure payloads. + +**Responsibilities:** + +- Downloads documents from blob storage. +- Parses documents using Azure Document Intelligence or local fallbacks. +- Extracts tables as HTML fragments. +- Crops figure images with PyMuPDF and serialises them as base64 payloads. +- Emits markdown text containing `
` placeholders and companion metadata arrays. +- Captures per-page metadata linking text passages to figure identifiers. + +**Configuration:** + +- 10-minute timeout (supports large documents and multimodal preprocessing). +- 4096 MB instance memory (for document parsing and image manipulation). +- Python 3.11 runtime. +- Uses managed identity for authentication. + +**Input Format (Azure Search custom skill):** + +```json +{ + "values": [ + { + "recordId": "1", + "data": { + "blobUrl": "https://storage.../content/doc.pdf", + "fileName": "doc.pdf", + "contentType": "application/pdf" + } + } + ] +} +``` + +**Output Format:** + +```json +{ + "values": [ + { + "recordId": "1", + "data": { + "pages": [ + { + "pageNumber": 0, + "text": "Page text with
", + "figureIds": ["fig1"] + } + ], + + "figures": [ + { + "id": "fig1", + "page": 2, + "fileName": "doc.pdf", + "mimeType": "image/png", + "imageBase64": "iVBORw0...", + "bbox": [12.4, 30.1, 180.6, 210.2] + } + ], + "images": [] + }, + "errors": [], + "warnings": [] + } + ] +} +``` + +### 2. Figure Processor Function + +**Location:** `app/functions/figure_processor/` + +**Purpose:** Second stage—turns individual figure payloads into reusable assets and embeddings. + +**Responsibilities:** + +- Uploads figure bytes to blob storage and generates signed URLs or stored paths. +- Produces natural-language captions via GPT-4o or Content Understanding. +- Generates image embeddings via Azure AI Vision (when multimodal is enabled). +- Emits enriched figure metadata for downstream text processing. + +**Configuration:** + +- 6-minute timeout (covers caption plus embedding latency for complex figures). +- 3072 MB instance memory (accommodates concurrent figure batches). +- Python 3.11 runtime. +- Uses managed identity for authentication. + +**Input Format (context `/document/figures/*`):** Azure AI Search expands the skill context, calling the function once per figure and supplying a unique `recordId` for each entry in the `values` array. + +```json +{ + "values": [ + { + "recordId": "1", + "data": { + "id": "fig1", + "fileName": "doc.pdf", + "mimeType": "image/png", + "imageBase64": "iVBORw0...", + "page": 2, + "bbox": [12.4, 30.1, 180.6, 210.2] + } + } + ] +} +``` + +**Output Format:** + +```json +{ + "values": [ + { + "recordId": "1", + "data": { + "id": "fig1", + "url": "https://storage.../images/doc-fig1.png", + "caption": "Bar chart showing quarterly revenue", + "imageEmbedding": [0.789, -0.012, ...] + }, + "errors": [], + "warnings": [] + } + ] +} +``` + +### 3. Text Processor Function + +**Location:** `app/functions/text_processor/` + +**Purpose:** Third stage—recombines enriched figure metadata with text, then produces search-ready chunks. 
+ +**Responsibilities:** + +- Merges processed figure metadata back into markdown placeholders. +- Preserves `
` positioning so figures stay with their surrounding narrative. +- Splits text into semantically meaningful chunks using `SentenceTextSplitter`. +- Generates text embeddings via Azure OpenAI. +- Emits chunk documents referencing figure descriptors and optional image embeddings. + +**Configuration:** + +- 5-minute timeout (sized for batching text embeddings). +- 2048 MB instance memory (increase if batching large embeddings). +- Python 3.11 runtime. +- Uses managed identity for authentication. + +Because the enrichment tree preserves the updated `/document/figures` collection after skill #2 runs, this skill receives a fully enriched array of figure descriptors alongside the markdown source. + +**Input Format:** + +```json +{ + "values": [ + { + "recordId": "1", + "data": { + "text": "# Document...
", + "tables": [...], + "figures": [ + { + "id": "fig1", + "url": "https://storage.../images/doc-fig1.png", + "caption": "Bar chart...", + "imageEmbedding": [0.789, -0.012, ...] + } + ], + "fileName": "doc.pdf" + } + } + ] +} +``` + +**Output Format:** + +```json +{ + "values": [ + { + "recordId": "1", + "data": { + "chunks": [ + { + "id": "doc.pdf-0001", + "content": "Content chunk with
", + "embedding": [0.123, -0.456, ...], + "sourcepage": "doc.pdf-1", + "sourcefile": "doc.pdf", + "images": [ + { + "id": "fig1", + "url": "https://storage.../images/doc-fig1.png", + "caption": "Bar chart...", + "imageEmbedding": [0.789, -0.012, ...] + } + ] + } + ] + }, + "errors": [], + "warnings": [] + } + ] +} + +**Record IDs:** The indexer maintains the original document `recordId` throughout the pipeline. Skills operating on collections (such as `/document/figures/*`) emit per-item suffixes internally, but every response still maps back to the same root document when the enrichment tree is reassembled. +``` + +### 4. Azure AI Search Indexer + +The indexer orchestrates the entire pipeline: + +**Data Source:** + +- Type: `azureblob` +- Container: `content` +- Monitors for new/modified blobs +- Can be configured to track deletions or soft delete markers + +**Skillset:** + +- Custom skill #1 (`/document` context): `document_extractor` (emits per-page text and figure payloads). +- Custom skill #2 (`/document/figures/*` context): `figure_processor` (fans out automatically so each figure is enriched independently before being merged back into `/document/figures`). +- Custom skill #3 (`/document` context): `text_processor` (combines markdown with enriched figures, then produces chunks/embeddings). +- Skill #3 consumes the per-page text output from skill #1 and the enriched figures output from skill #2. + +**Indexer:** + +- Runs on schedule (e.g., every 5 minutes) or on-demand +- Batch size: Configurable (e.g., 10 documents per batch) +- Handles retries with exponential backoff +- Tracks processing state per document +- Supports incremental updates (only processes changed documents) + +## Shared Code: prepdocslib + +All three functions share the same processing logic used by the local ingestion script. 
+ +**Location:** `app/backend/prepdocslib/` + +**Shared Modules:** + +- `pdfparser.py` - Document Intelligence and local PDF parsing +- `htmlparser.py` - HTML parsing +- `textparser.py` - Plain text parsing +- `textsplitter.py` - `SentenceTextSplitter` for semantic chunking +- `embeddings.py` - Azure OpenAI and image embedding services +- `blobmanager.py` - Blob storage operations +- `mediadescriber.py` - Figure description using GPT-4o or Content Understanding + +**Deployment:** + +Each function includes `../../backend/prepdocslib` in `requirements.txt` as a local dependency. During deployment: + +1. `pip` resolves the local path dependency. +2. The function deployment packages `prepdocslib` with all its dependencies. +3. The complete package is uploaded to the function's deployment blob container. + +## Configuration + +### Environment Variables (Function Apps) + +Both functions receive the same configuration as the backend app: + +**Azure Services:** + +```bash +# Storage +AZURE_STORAGE_ACCOUNT= +AZURE_STORAGE_CONTAINER=content +AZURE_IMAGESTORAGE_CONTAINER=images + +# Azure OpenAI +AZURE_OPENAI_SERVICE= +AZURE_OPENAI_EMB_DEPLOYMENT= +AZURE_OPENAI_EMB_MODEL_NAME=text-embedding-3-large +AZURE_OPENAI_EMB_DIMENSIONS=3072 +AZURE_OPENAI_API_VERSION=2024-06-01 + +# Document Intelligence +AZURE_DOCUMENTINTELLIGENCE_SERVICE= + +# Azure AI Vision (for multimodal) +AZURE_VISION_ENDPOINT= + +# Azure AI Search +AZURE_SEARCH_SERVICE= +AZURE_SEARCH_INDEX=gptkbindex +``` + +**Custom Skill Endpoints:** + +```bash +DOCUMENT_EXTRACTOR_SKILL_ENDPOINT=https://... +DOCUMENT_EXTRACTOR_SKILL_RESOURCE_ID=api://... +FIGURE_PROCESSOR_SKILL_ENDPOINT=https://... +FIGURE_PROCESSOR_SKILL_RESOURCE_ID=api://... +TEXT_PROCESSOR_SKILL_ENDPOINT=https://... +TEXT_PROCESSOR_SKILL_RESOURCE_ID=api://... 
+``` + +**Feature Flags:** + +```bash +USE_VECTORS=true +USE_MULTIMODAL=false +USE_LOCAL_PDF_PARSER=false +USE_LOCAL_HTML_PARSER=false +USE_MEDIA_DESCRIBER_AZURE_CU=false +``` + +**Authentication:** +All functions use **managed identity** (no connection strings or keys). + +### Bicep Parameters + +**infra/main.parameters.json:** + +```json +{ + "useCloudIngestion": { + "value": "${USE_CLOUD_INGESTION=false}" + } +} +``` + +When `useCloudIngestion=true`: + +- Deploys three Azure Functions (document_extractor, figure_processor, text_processor) on the Flex Consumption plan. +- Creates managed identities with appropriate role assignments (Storage, Search, Document Intelligence, OpenAI, Vision). +- Provisions the indexer, skillset, and data source. +- Configures the backend to use cloud ingestion. + +## Local vs Cloud Ingestion + +### Local Ingestion (Default) + +**Command:** `./scripts/prepdocs.sh` + +**Process:** + +1. Run `prepdocs.py` locally. +2. Upload documents to blob storage. +3. Process documents locally (parse, split, embed). +4. Upload chunks directly to Azure AI Search index. +5. MD5 tracking to skip unchanged files. + +**Use Cases:** + +- Initial data seeding. +- Development and testing. +- CI/CD pipelines. +- Small datasets. +- When you need immediate control. + +**Pros:** + +- Simple, direct control. +- Fast for small datasets. +- Works offline (can process locally before uploading). + +**Cons:** + +- Not scalable for large datasets. +- Requires local compute resources. +- No automatic incremental updates. + +### Cloud Ingestion + +**Command:** `./scripts/prepdocs.sh --use-cloud-ingestion` + +**Process:** + +1. Upload documents to blob storage only. +2. Indexer automatically detects new/changed documents. +3. Functions process documents in parallel (figures and text scale independently). +4. Chunks written directly to search index. +5. Indexer tracks state (no MD5 needed). + +**Use Cases:** + +- Production environments. +- Large datasets. 
+- Continuous ingestion (monitoring blob container). +- Event-driven processing. +- Horizontal scaling requirements. + +**Pros:** + +- Serverless, scales automatically (up to 1000 instances). +- No local compute needed. +- Built-in retry and error handling. +- Incremental updates (only processes changes). +- Cost-effective (pay only when processing). + +**Cons:** + +- Slightly more complex setup. +- Depends on Azure services being available. +- Indexer scheduling introduces latency (configurable). + +## MD5 Tracking + +**Local ingestion** uses MD5 files to track uploaded documents: + +- MD5 hash stored in `data/*.md5` files. +- Used to skip re-uploading unchanged files to blob storage. +- Still needed even with cloud ingestion for initial uploads. + +**Cloud ingestion** does not need MD5 for processing: + +- Indexer uses blob `lastModified` timestamp. +- Automatically detects new and changed documents. +- No MD5 files created for processed chunks. + +## Deployment + +### Prerequisites + +1. Azure CLI with Functions extension: + + ```bash + az extension add --name functions + ``` + +1. Azure Functions Core Tools v4: + + ```bash + brew install azure-functions-core-tools@4 # macOS + ``` + +### Deploy Infrastructure + +```bash +azd provision +``` + +This creates: + +- Function App (Flex Consumption plan). +- Three function deployments (`document_extractor`, `figure_processor`, `text_processor`). +- Managed identities. +- Role assignments (Storage, OpenAI, Document Intelligence, Vision, Search). +- Indexer, skillset, and data source (if `USE_CLOUD_INGESTION=true`). + +### Deploy Function Code + +Functions are deployed as part of `azd up` / `azd deploy`. Manual `func azure functionapp publish` steps are not supported in this workflow—always let azd handle packaging, app settings, and managed identity assignments so that the skillset stays in sync with infrastructure as code. 
+ +### Upload Initial Data + +```bash +# Upload documents (triggers indexer if cloud ingestion enabled) +./scripts/prepdocs.sh + +# Or explicitly use cloud ingestion +./scripts/prepdocs.sh --use-cloud-ingestion +``` + +## Monitoring + +### Application Insights + +All three functions send telemetry to Application Insights: + +- Request duration and success rate. +- Custom skill execution metrics. +- Error logs with stack traces. +- Performance counters. + +**View in Azure Portal:** + +1. Navigate to Function App → Application Insights. +2. Check "Live Metrics" for real-time monitoring. +3. Use "Failures" blade for error analysis. + +### Indexer Status + +Check indexer execution history: + +```bash +# Azure CLI +az search indexer show \ + --service-name \ + --name + +# Or via Azure Portal +# Navigate to Search Service → Indexers → View execution history +``` + +### Function Logs + +Stream function logs in real-time: + +```bash +func azure functionapp logstream +``` + +## Troubleshooting + +### Function Timeouts + +**Symptom:** Functions timing out on large documents + +**Solution:** + +- Increase `functionTimeout` in `host.json` (max 10 minutes for `document_extractor`). +- Increase instance memory (2048 MB or 4096 MB). +- Consider splitting very large documents before upload. + +### Embedding Rate Limits + +**Symptom:** HTTP 429 errors from OpenAI + +**Solution:** + +- The embedding service includes retry logic with exponential backoff. +- Reduce indexer batch size to process fewer documents concurrently. +- Increase OpenAI deployment capacity (TPM). + +### Missing Images + +**Symptom:** Figures not appearing in search results + +**Solution:** + +- Verify `USE_MULTIMODAL=true` is set. +- Check that images container has proper CORS settings. +- Verify function has "Storage Blob Data Contributor" role. +- Check Application Insights for image upload errors. 
+ +### Indexer Failures + +**Symptom:** Indexer shows failed executions + +**Solution:** + +- Check indexer execution history for error details. +- Verify custom skill URLs are accessible (not 404). +- Check function authentication (managed identity or keys configured in skillset). +- Review function logs in Application Insights. + +## Cost Optimization + +### Function App + +**Flex Consumption Billing:** + +- Execution time × memory provisioned (GB-seconds). +- Number of executions. +- Always-ready instances (if configured). + +**Tips:** + +- Use 2048 MB memory unless you need more (text processing or multimodal workloads). +- Set appropriate timeouts (don't over-provision). +- Don't use always-ready instances for this workload (batch processing). + +### Indexer + +**Indexer Runs:** + +- Free tier: Limited indexer runs per day. +- Standard tier: Unlimited runs. + +**Tips:** + +- Adjust schedule based on upload frequency (don't run too frequently). +- Use on-demand indexer runs for manual uploads. +- Enable "high water mark" change detection (only processes new/changed docs). + +## Security + +### Managed Identities + +All authentication uses **managed identities** (no secrets): + +- Function App → Storage (read content, write images). +- Function App → OpenAI (embeddings, GPT-4o). +- Function App → Document Intelligence (parsing). +- Function App → Vision (figure analysis and captioning). +- Function App → Search (index writing). + +### Network Security + +Optional private networking: + +- Functions can be deployed in a Virtual Network. +- Private endpoints for Storage, OpenAI, Document Intelligence, Vision. +- Network isolation for production workloads. + +### Access Control + +Custom skills authenticate with **Microsoft Entra ID** using managed identities: + +- Azure AI Search calls each function using its system- or user-assigned managed identity and the skill's `authResourceId`. 
+- Each Function App enables App Service Authentication (Easy Auth) and trusts tokens issued for the registered application ID. +- Disable or avoid distributing function keys; they are unnecessary when managed identity is configured. + +## Performance + +### Throughput + +**Expected performance:** + +- Document extraction: 1-2 minutes per document (with multimodal). +- Figure processing: 10-20 seconds per document (depends on vision workload). +- Text processing and embedding: 10-30 seconds per document. +- End-to-end: 2-3 minutes per document. + +**Scaling:** + +- Indexer batch size: 10 documents (configurable). +- Function instances: Auto-scale based on load. +- Max concurrent executions: Limited by OpenAI TPM quota. + +### Optimization Tips + +1. **Batch uploads:** Upload multiple documents at once for parallel processing +2. **Pre-process documents:** Remove unnecessary content before upload +3. **Tune chunk size:** Balance between retrieval quality and processing time +4. **Use local parsers:** Faster but lower quality for simple documents + +## Migration from Local to Cloud Ingestion + +### Step-by-Step + +1. **Deploy infrastructure:** + + ```bash + azd env set USE_CLOUD_INGESTION true + azd provision + ``` + +1. **Test with sample documents:** + + ```bash + # Upload a few test documents + az storage blob upload-batch \ + -s ./data -d content \ + --account-name + ``` + +1. **Verify indexer runs:** + - Check Azure Portal → Search Service → Indexers + - Verify documents appear in index + +1. **Upload full dataset:** + + ```bash + ./scripts/prepdocs.sh --use-cloud-ingestion + ``` + +1. 
**Monitor progress:** + - Application Insights → Live Metrics + - Indexer execution history + +### Rollback + +To revert to local ingestion: + +```bash +azd env set USE_CLOUD_INGESTION false +./scripts/prepdocs.sh # Uses local processing +``` + +## References + +- [Azure AI Search Custom Skills](https://learn.microsoft.com/azure/search/cognitive-search-custom-skill-web-api) +- [Azure Functions Flex Consumption Plan](https://learn.microsoft.com/azure/azure-functions/flex-consumption-plan) +- [Azure AI Search Indexers](https://learn.microsoft.com/azure/search/search-indexer-overview) +- [Custom Skill Interface](https://learn.microsoft.com/azure/search/cognitive-search-custom-skill-interface) diff --git a/infra/app/functions-app.bicep b/infra/app/functions-app.bicep new file mode 100644 index 0000000000..7710c27050 --- /dev/null +++ b/infra/app/functions-app.bicep @@ -0,0 +1,97 @@ +// Single function app module +param name string +param location string = resourceGroup().location +param tags object = {} +param applicationInsightsName string +param appServicePlanId string +param appSettings object = {} +param runtimeName string +param runtimeVersion string +param storageAccountName string +param deploymentStorageContainerName string +param instanceMemoryMB int = 2048 +param maximumInstanceCount int = 100 +param identityId string +param identityClientId string +param functionTimeout string = '00:05:00' + +var identityType = 'UserAssigned' +var kind = 'functionapp,linux' +var applicationInsightsIdentity = 'ClientId=${identityClientId};Authorization=AAD' + +// Reference existing resources +resource stg 'Microsoft.Storage/storageAccounts@2023-05-01' existing = { + name: storageAccountName +} + +resource applicationInsights 'Microsoft.Insights/components@2020-02-02' existing = { + name: applicationInsightsName +} + +// Create base application settings +var baseAppSettings = { + // Storage credentials for AzureWebJobsStorage + AzureWebJobsStorage__credential: 
'managedidentity' + AzureWebJobsStorage__clientId: identityClientId + AzureWebJobsStorage__blobServiceUri: stg.properties.primaryEndpoints.blob + AzureWebJobsStorage__queueServiceUri: stg.properties.primaryEndpoints.queue + AzureWebJobsStorage__tableServiceUri: stg.properties.primaryEndpoints.table + + // Application Insights + APPLICATIONINSIGHTS_AUTHENTICATION_STRING: applicationInsightsIdentity + APPLICATIONINSIGHTS_CONNECTION_STRING: applicationInsights.properties.ConnectionString + + // Function timeout + FUNCTIONS_EXTENSION_VERSION: '~4' + FUNCTIONS_WORKER_RUNTIME: runtimeName +} + +// Merge all app settings +var allAppSettings = union(appSettings, baseAppSettings) + +// Create Flex Consumption Function App using AVM +module functionApp 'br/public:avm/res/web/site:0.15.1' = { + name: '${name}-func-app' + params: { + kind: kind + name: name + location: location + tags: tags + serverFarmResourceId: appServicePlanId + managedIdentities: { + userAssignedResourceIds: [identityId] + } + functionAppConfig: { + deployment: { + storage: { + type: 'blobContainer' + value: '${stg.properties.primaryEndpoints.blob}${deploymentStorageContainerName}' + authentication: { + type: identityType + userAssignedIdentityResourceId: identityId + } + } + } + scaleAndConcurrency: { + instanceMemoryMB: instanceMemoryMB + maximumInstanceCount: maximumInstanceCount + } + runtime: { + name: runtimeName + version: runtimeVersion + } + } + siteConfig: { + alwaysOn: false + functionAppScaleLimit: maximumInstanceCount + cors: { + allowedOrigins: ['https://portal.azure.com'] + } + } + appSettingsKeyValuePairs: allAppSettings + } +} + +// Outputs +output name string = functionApp.outputs.name +output defaultHostname string = functionApp.outputs.defaultHostname diff --git a/infra/app/functions-rbac.bicep b/infra/app/functions-rbac.bicep new file mode 100644 index 0000000000..72e39a6bf2 --- /dev/null +++ b/infra/app/functions-rbac.bicep @@ -0,0 +1,130 @@ +// RBAC assignments for function apps 
+param principalId string +param storageResourceGroupName string +param searchServiceResourceGroupName string +param openAiResourceGroupName string +param documentIntelligenceResourceGroupName string +param visionServiceName string = '' +param visionResourceGroupName string = '' +param contentUnderstandingServiceName string = '' +param contentUnderstandingResourceGroupName string = '' +param useMultimodal bool + +// Role Definition IDs +var storageBlobDataReaderRoleId = '2a2b9908-6ea1-4ae2-8e65-a410df84e7d1' // Read content container +var storageBlobDataContributorRoleId = 'ba92f5b4-2d11-453d-a403-e96b0029c9fe' // Write images container +var storageQueueDataContributorRoleId = '974c5e8b-45b9-4653-ba55-5f855dd0fb88' // For AzureWebJobsStorage +var storageTableDataContributorRoleId = '0a9a7e1f-b9d0-4cc4-a60d-0319b160aaa3' // For AzureWebJobsStorage +var cognitiveServicesOpenAIUserRoleId = '5e0bd9bd-7b93-4f28-af87-19fc36ad61bd' // OpenAI access +var cognitiveServicesUserRoleId = 'a97b65f3-24c7-4388-baec-2e87135dc908' // Document Intelligence, Vision, CU +var searchIndexDataContributorRoleId = '8ebe5a00-799e-43f5-93ac-243d3dce84a7' // Write to search index +var monitoringMetricsPublisherRoleId = '3913510d-42f4-4e42-8a64-420c390055eb' // Application Insights + +// Storage: Blob Data Reader (read content container) +module storageBlobReaderRole '../core/security/role.bicep' = { + scope: resourceGroup(storageResourceGroupName) + name: 'storage-blob-reader-${uniqueString(principalId)}' + params: { + principalId: principalId + roleDefinitionId: storageBlobDataReaderRoleId + principalType: 'ServicePrincipal' + } +} + +// Storage: Blob Data Contributor (write images container, deployment container) +module storageBlobContributorRole '../core/security/role.bicep' = { + scope: resourceGroup(storageResourceGroupName) + name: 'storage-blob-contributor-${uniqueString(principalId)}' + params: { + principalId: principalId + roleDefinitionId: storageBlobDataContributorRoleId + 
principalType: 'ServicePrincipal' + } +} + +// Storage: Queue Data Contributor (for AzureWebJobsStorage) +module storageQueueContributorRole '../core/security/role.bicep' = { + scope: resourceGroup(storageResourceGroupName) + name: 'storage-queue-contributor-${uniqueString(principalId)}' + params: { + principalId: principalId + roleDefinitionId: storageQueueDataContributorRoleId + principalType: 'ServicePrincipal' + } +} + +// Storage: Table Data Contributor (for AzureWebJobsStorage) +module storageTableContributorRole '../core/security/role.bicep' = { + scope: resourceGroup(storageResourceGroupName) + name: 'storage-table-contributor-${uniqueString(principalId)}' + params: { + principalId: principalId + roleDefinitionId: storageTableDataContributorRoleId + principalType: 'ServicePrincipal' + } +} + +// Search: Index Data Contributor (write chunks to index) +module searchIndexContributorRole '../core/security/role.bicep' = { + scope: resourceGroup(searchServiceResourceGroupName) + name: 'search-index-contributor-${uniqueString(principalId)}' + params: { + principalId: principalId + roleDefinitionId: searchIndexDataContributorRoleId + principalType: 'ServicePrincipal' + } +} + +// OpenAI: Cognitive Services OpenAI User +module openAiUserRole '../core/security/role.bicep' = { + scope: resourceGroup(openAiResourceGroupName) + name: 'openai-user-${uniqueString(principalId)}' + params: { + principalId: principalId + roleDefinitionId: cognitiveServicesOpenAIUserRoleId + principalType: 'ServicePrincipal' + } +} + +// Document Intelligence: Cognitive Services User +module documentIntelligenceUserRole '../core/security/role.bicep' = { + scope: resourceGroup(documentIntelligenceResourceGroupName) + name: 'doc-intelligence-user-${uniqueString(principalId)}' + params: { + principalId: principalId + roleDefinitionId: cognitiveServicesUserRoleId + principalType: 'ServicePrincipal' + } +} + +// Vision: Cognitive Services User (if multimodal) +module visionUserRole 
'../core/security/role.bicep' = if (useMultimodal && !empty(visionServiceName)) { + scope: resourceGroup(visionResourceGroupName) + name: 'vision-user-${uniqueString(principalId)}' + params: { + principalId: principalId + roleDefinitionId: cognitiveServicesUserRoleId + principalType: 'ServicePrincipal' + } +} + +// Content Understanding: Cognitive Services User (if multimodal) +module contentUnderstandingUserRole '../core/security/role.bicep' = if (useMultimodal && !empty(contentUnderstandingServiceName)) { + scope: resourceGroup(contentUnderstandingResourceGroupName) + name: 'content-understanding-user-${uniqueString(principalId)}' + params: { + principalId: principalId + roleDefinitionId: cognitiveServicesUserRoleId + principalType: 'ServicePrincipal' + } +} + +// Application Insights: Monitoring Metrics Publisher +module appInsightsMetricsPublisherRole '../core/security/role.bicep' = { + name: 'appinsights-metrics-${uniqueString(principalId)}' + params: { + principalId: principalId + roleDefinitionId: monitoringMetricsPublisherRoleId + principalType: 'ServicePrincipal' + } +} diff --git a/infra/app/functions.bicep b/infra/app/functions.bicep new file mode 100644 index 0000000000..d8a6f8b06a --- /dev/null +++ b/infra/app/functions.bicep @@ -0,0 +1,291 @@ +// Parameters for both function apps +param location string = resourceGroup().location +param tags object = {} +param applicationInsightsName string +param storageAccountName string +param storageResourceGroupName string +param searchServiceName string +param searchServiceResourceGroupName string +param openAiServiceName string +param openAiResourceGroupName string +param documentIntelligenceServiceName string +param documentIntelligenceResourceGroupName string +param visionServiceName string = '' +param visionResourceGroupName string = '' +param contentUnderstandingServiceName string = '' +param contentUnderstandingResourceGroupName string = '' + +// Function App Names +param documentExtractorName string +param 
figureProcessorName string +param textProcessorName string + +// Shared configuration +param useVectors bool +param useMultimodal bool +param useLocalPdfParser bool +param useLocalHtmlParser bool +param useMediaDescriberAzureCU bool +param searchIndexName string +param searchFieldNameEmbedding string +param openAiEmbDeployment string +param openAiEmbModelName string +param openAiEmbDimensions int +param openAiApiVersion string +param openAiChatDeployment string +param openAiChatModelName string +param openAiCustomUrl string + +var abbrs = loadJsonContent('../abbreviations.json') +var resourceToken = toLower(uniqueString(subscription().id, resourceGroup().id, location)) + +// Common app settings for both functions +var commonAppSettings = { + // Storage + AZURE_STORAGE_ACCOUNT: storageAccountName + AZURE_STORAGE_CONTAINER: 'content' + AZURE_IMAGESTORAGE_CONTAINER: 'images' + + // Azure OpenAI + AZURE_OPENAI_SERVICE: openAiServiceName + AZURE_OPENAI_EMB_DEPLOYMENT: openAiEmbDeployment + AZURE_OPENAI_EMB_MODEL_NAME: openAiEmbModelName + AZURE_OPENAI_EMB_DIMENSIONS: string(openAiEmbDimensions) + AZURE_OPENAI_API_VERSION: openAiApiVersion + AZURE_OPENAI_CHATGPT_DEPLOYMENT: openAiChatDeployment + AZURE_OPENAI_CHATGPT_MODEL: openAiChatModelName + AZURE_OPENAI_CUSTOM_URL: openAiCustomUrl + + // Azure AI Search + AZURE_SEARCH_SERVICE: searchServiceName + AZURE_SEARCH_INDEX: searchIndexName + AZURE_SEARCH_FIELD_NAME_EMBEDDING: searchFieldNameEmbedding + + // Document Intelligence + AZURE_DOCUMENTINTELLIGENCE_SERVICE: documentIntelligenceServiceName + + // Feature flags + USE_VECTORS: string(useVectors) + USE_MULTIMODAL: string(useMultimodal) + USE_LOCAL_PDF_PARSER: string(useLocalPdfParser) + USE_LOCAL_HTML_PARSER: string(useLocalHtmlParser) + USE_MEDIA_DESCRIBER_AZURE_CU: string(useMediaDescriberAzureCU) +} + +// Add optional vision settings +var visionSettings = useMultimodal && !empty(visionServiceName) ? 
{ + AZURE_VISION_ENDPOINT: 'https://${visionServiceName}.cognitiveservices.azure.com/' +} : {} + +// Add optional content understanding settings +var contentUnderstandingSettings = useMultimodal && !empty(contentUnderstandingServiceName) ? { + AZURE_CONTENTUNDERSTANDING_ENDPOINT: 'https://${contentUnderstandingServiceName}.cognitiveservices.azure.com/' +} : {} + +// Merge all settings +var allAppSettings = union(commonAppSettings, visionSettings, contentUnderstandingSettings) + +// Deployment storage containers +var documentExtractorDeploymentContainer = 'deploy-doc-extractor-${take(resourceToken, 7)}' +var figureProcessorDeploymentContainer = 'deploy-figure-processor-${take(resourceToken, 7)}' +var textProcessorDeploymentContainer = 'deploy-text-processor-${take(resourceToken, 7)}' + +// Create deployment containers in storage account +module deploymentContainers 'br/public:avm/res/storage/storage-account:0.8.3' = { + name: 'function-deployment-containers' + scope: resourceGroup(storageResourceGroupName) + params: { + name: storageAccountName + location: location + blobServices: { + containers: [ + { name: documentExtractorDeploymentContainer } + { name: figureProcessorDeploymentContainer } + { name: textProcessorDeploymentContainer } + ] + } + } +} + +// User-assigned managed identity for document extractor +module documentExtractorIdentity 'br/public:avm/res/managed-identity/user-assigned-identity:0.4.1' = { + name: 'doc-extractor-identity' + params: { + location: location + tags: tags + name: '${abbrs.managedIdentityUserAssignedIdentities}doc-extractor-${resourceToken}' + } +} + +// User-assigned managed identity for text processor +module textProcessorIdentity 'br/public:avm/res/managed-identity/user-assigned-identity:0.4.1' = { + name: 'text-processor-identity' + params: { + location: location + tags: tags + name: '${abbrs.managedIdentityUserAssignedIdentities}text-processor-${resourceToken}' + } +} + +// User-assigned managed identity for figure processor 
+module figureProcessorIdentity 'br/public:avm/res/managed-identity/user-assigned-identity:0.4.1' = { + name: 'figure-processor-identity' + params: { + location: location + tags: tags + name: '${abbrs.managedIdentityUserAssignedIdentities}figure-processor-${resourceToken}' + } +} + +// App Service Plan (Flex Consumption) +module appServicePlan 'br/public:avm/res/web/serverfarm:0.1.1' = { + name: 'functions-plan' + params: { + name: '${abbrs.webServerFarms}functions-${resourceToken}' + sku: { + name: 'FC1' + tier: 'FlexConsumption' + } + reserved: true // Required for Linux + location: location + tags: tags + } +} + +// Document Extractor Function App +module documentExtractor 'functions-app.bicep' = { + name: 'document-extractor-func' + params: { + name: documentExtractorName + location: location + tags: union(tags, { 'azd-service-name': 'document-extractor' }) + applicationInsightsName: applicationInsightsName + appServicePlanId: appServicePlan.outputs.resourceId + runtimeName: 'python' + runtimeVersion: '3.11' + storageAccountName: storageAccountName + deploymentStorageContainerName: documentExtractorDeploymentContainer + identityId: documentExtractorIdentity.outputs.resourceId + identityClientId: documentExtractorIdentity.outputs.clientId + appSettings: allAppSettings + instanceMemoryMB: 4096 // High memory for document processing + maximumInstanceCount: 100 + functionTimeout: '00:10:00' // 10 minutes for long-running extraction + } + dependsOn: [ + deploymentContainers + ] +} + +// Figure Processor Function App +module figureProcessor 'functions-app.bicep' = { + name: 'figure-processor-func' + params: { + name: figureProcessorName + location: location + tags: union(tags, { 'azd-service-name': 'figure-processor' }) + applicationInsightsName: applicationInsightsName + appServicePlanId: appServicePlan.outputs.resourceId + runtimeName: 'python' + runtimeVersion: '3.11' + storageAccountName: storageAccountName + deploymentStorageContainerName: 
figureProcessorDeploymentContainer + identityId: figureProcessorIdentity.outputs.resourceId + identityClientId: figureProcessorIdentity.outputs.clientId + appSettings: allAppSettings + instanceMemoryMB: 2048 + maximumInstanceCount: 100 + functionTimeout: '00:05:00' + } + dependsOn: [ + deploymentContainers + ] +} + +// Text Processor Function App +module textProcessor 'functions-app.bicep' = { + name: 'text-processor-func' + params: { + name: textProcessorName + location: location + tags: union(tags, { 'azd-service-name': 'text-processor' }) + applicationInsightsName: applicationInsightsName + appServicePlanId: appServicePlan.outputs.resourceId + runtimeName: 'python' + runtimeVersion: '3.11' + storageAccountName: storageAccountName + deploymentStorageContainerName: textProcessorDeploymentContainer + identityId: textProcessorIdentity.outputs.resourceId + identityClientId: textProcessorIdentity.outputs.clientId + appSettings: allAppSettings + instanceMemoryMB: 2048 // Standard memory for embedding + maximumInstanceCount: 100 + functionTimeout: '00:05:00' // 5 minutes default + } + dependsOn: [ + deploymentContainers + ] +} + +// RBAC: Document Extractor Roles +module documentExtractorRbac 'functions-rbac.bicep' = { + name: 'doc-extractor-rbac' + params: { + principalId: documentExtractorIdentity.outputs.principalId + storageResourceGroupName: storageResourceGroupName + searchServiceResourceGroupName: searchServiceResourceGroupName + openAiResourceGroupName: openAiResourceGroupName + documentIntelligenceResourceGroupName: documentIntelligenceResourceGroupName + visionServiceName: visionServiceName + visionResourceGroupName: visionResourceGroupName + contentUnderstandingServiceName: contentUnderstandingServiceName + contentUnderstandingResourceGroupName: contentUnderstandingResourceGroupName + useMultimodal: useMultimodal + } +} + +// RBAC: Text Processor Roles +module textProcessorRbac 'functions-rbac.bicep' = { + name: 'text-processor-rbac' + params: { + 
principalId: textProcessorIdentity.outputs.principalId + storageResourceGroupName: storageResourceGroupName + searchServiceResourceGroupName: searchServiceResourceGroupName + openAiResourceGroupName: openAiResourceGroupName + documentIntelligenceResourceGroupName: documentIntelligenceResourceGroupName + visionServiceName: visionServiceName + visionResourceGroupName: visionResourceGroupName + contentUnderstandingServiceName: contentUnderstandingServiceName + contentUnderstandingResourceGroupName: contentUnderstandingResourceGroupName + useMultimodal: useMultimodal + } +} + +// RBAC: Figure Processor Roles +module figureProcessorRbac 'functions-rbac.bicep' = { + name: 'figure-processor-rbac' + params: { + principalId: figureProcessorIdentity.outputs.principalId + storageResourceGroupName: storageResourceGroupName + searchServiceResourceGroupName: searchServiceResourceGroupName + openAiResourceGroupName: openAiResourceGroupName + documentIntelligenceResourceGroupName: documentIntelligenceResourceGroupName + visionServiceName: visionServiceName + visionResourceGroupName: visionResourceGroupName + contentUnderstandingServiceName: contentUnderstandingServiceName + contentUnderstandingResourceGroupName: contentUnderstandingResourceGroupName + useMultimodal: useMultimodal + } +} + +// Outputs +output documentExtractorName string = documentExtractor.outputs.name +output documentExtractorUrl string = documentExtractor.outputs.defaultHostname +output documentExtractorIdentityPrincipalId string = documentExtractorIdentity.outputs.principalId +output figureProcessorName string = figureProcessor.outputs.name +output figureProcessorUrl string = figureProcessor.outputs.defaultHostname +output figureProcessorIdentityPrincipalId string = figureProcessorIdentity.outputs.principalId +output textProcessorName string = textProcessor.outputs.name +output textProcessorUrl string = textProcessor.outputs.defaultHostname +output textProcessorIdentityPrincipalId string = 
textProcessorIdentity.outputs.principalId +output appServicePlanId string = appServicePlan.outputs.resourceId diff --git a/infra/main.bicep b/infra/main.bicep index 030acab981..a1dcfce1ce 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -130,6 +130,7 @@ param speechServiceSkuName string // Set in main.parameters.json param speechServiceVoice string = '' param useMultimodal bool = false param useEval bool = false +param useCloudIngestion bool = false @allowed(['free', 'provisioned', 'serverless']) param cosmosDbSkuName string // Set in main.parameters.json @@ -657,6 +658,46 @@ module acaAuth 'core/host/container-apps-auth.bicep' = if (deploymentTarget == ' } } +// FUNCTION APPS FOR CLOUD INGESTION +module functions 'app/functions.bicep' = if (useCloudIngestion) { + name: 'functions' + scope: resourceGroup + params: { + location: location + tags: tags + applicationInsightsName: useApplicationInsights ? monitoring.outputs.applicationInsightsName : '' + storageAccountName: storage.outputs.name + storageResourceGroupName: storageResourceGroup.name + searchServiceName: searchService.outputs.name + searchServiceResourceGroupName: searchServiceResourceGroup.name + openAiServiceName: isAzureOpenAiHost ? openAi.outputs.name : '' + openAiResourceGroupName: openAiResourceGroup.name + documentIntelligenceServiceName: documentIntelligence.outputs.name + documentIntelligenceResourceGroupName: documentIntelligenceResourceGroup.name + visionServiceName: useMultimodal ? vision.outputs.name : '' + visionResourceGroupName: useMultimodal ? visionResourceGroup.name : resourceGroup.name + contentUnderstandingServiceName: useMediaDescriberAzureCU ? contentUnderstanding.outputs.name : '' + contentUnderstandingResourceGroupName: useMediaDescriberAzureCU ? 
contentUnderstandingResourceGroup.name : resourceGroup.name + documentExtractorName: '${abbrs.webSitesFunctions}doc-extractor-${resourceToken}' + figureProcessorName: '${abbrs.webSitesFunctions}figure-processor-${resourceToken}' + textProcessorName: '${abbrs.webSitesFunctions}text-processor-${resourceToken}' + useVectors: ragSearchTextEmbeddings || ragSearchImageEmbeddings + useMultimodal: useMultimodal + useLocalPdfParser: useLocalPdfParser + useLocalHtmlParser: useLocalHtmlParser + useMediaDescriberAzureCU: useMediaDescriberAzureCU + searchIndexName: searchIndexName + searchFieldNameEmbedding: searchFieldNameEmbedding + openAiEmbDeployment: embedding.deploymentName + openAiEmbModelName: embedding.modelName + openAiEmbDimensions: embedding.dimensions + openAiApiVersion: azureOpenAiApiVersion + openAiChatDeployment: chatGpt.deploymentName + openAiChatModelName: chatGpt.modelName + openAiCustomUrl: azureOpenAiCustomUrl + } +} + var defaultOpenAiDeployments = [ { name: chatGpt.deploymentName diff --git a/infra/main.parameters.json b/infra/main.parameters.json index 7a637c8022..899440485d 100644 --- a/infra/main.parameters.json +++ b/infra/main.parameters.json @@ -209,6 +209,9 @@ "useEval": { "value": "${USE_EVAL=false}" }, + "useCloudIngestion": { + "value": "${USE_CLOUD_INGESTION=false}" + }, "enableLanguagePicker": { "value": "${ENABLE_LANGUAGE_PICKER=false}" }, diff --git a/tests/test_mediadescriber.py b/tests/test_mediadescriber.py index 2f767f712e..6822e28468 100644 --- a/tests/test_mediadescriber.py +++ b/tests/test_mediadescriber.py @@ -68,7 +68,7 @@ def mock_get(self, url, **kwargs): "startPageNumber": 1, "endPageNumber": 1, "unit": "pixel", - "pages": [{"pageNumber": 1}], + "pages": [{"pageNumber": 0}], } ], }, diff --git a/tests/test_pdfparser.py b/tests/test_pdfparser.py index e22c4d9e7b..5ed36bdeb0 100644 --- a/tests/test_pdfparser.py +++ b/tests/test_pdfparser.py @@ -21,12 +21,9 @@ from azure.core.exceptions import HttpResponseError from PIL import 
Image, ImageChops -from prepdocslib.mediadescriber import ( - ContentUnderstandingDescriber, - MultimodalModelDescriber, -) +from prepdocslib.figureprocessor import FigureProcessor, MediaDescriptionStrategy from prepdocslib.page import ImageOnPage -from prepdocslib.pdfparser import DocumentAnalysisParser, MediaDescriptionStrategy +from prepdocslib.pdfparser import DocumentAnalysisParser from .mocks import MockAzureCredential @@ -114,15 +111,16 @@ def test_table_to_html_with_spans(): @pytest.mark.asyncio async def test_process_figure_without_bounding_regions(): - doc = MagicMock() figure = DocumentFigure(id="1", caption=None, bounding_regions=None) - media_describer = MagicMock() - - result = await DocumentAnalysisParser.process_figure(doc, figure, media_describer) - expected_html = "
1
" + result = await DocumentAnalysisParser.process_figure(None, figure) assert isinstance(result, ImageOnPage) - assert result.description == expected_html + assert result.description == "" + assert result.title == "" + assert result.figure_id == "1" + assert result.page_num == 0 + assert result.bbox == (0, 0, 0, 0) + assert result.filename == "figure1.png" @pytest.mark.asyncio @@ -136,13 +134,6 @@ async def test_process_figure_with_bounding_regions(monkeypatch, caplog): BoundingRegion(page_number=2, polygon=[1.4703, 2.8371, 5.5409, 2.8415, 5.5381, 6.6022, 1.4681, 6.5978]), ], ) - media_describer = AsyncMock() - - async def mock_describe_image(image_bytes): - assert image_bytes == b"image_bytes" - return "Described Image" - - monkeypatch.setattr(media_describer, "describe_image", mock_describe_image) def mock_crop_image_from_pdf_page(doc, page_number, bounding_box): assert page_number == 0 @@ -152,11 +143,11 @@ def mock_crop_image_from_pdf_page(doc, page_number, bounding_box): monkeypatch.setattr(DocumentAnalysisParser, "crop_image_from_pdf_page", mock_crop_image_from_pdf_page) with caplog.at_level(logging.WARNING): - result = await DocumentAnalysisParser.process_figure(doc, figure, media_describer) - expected_html = "
1 Logo
Described Image
" + result = await DocumentAnalysisParser.process_figure(doc, figure) assert isinstance(result, ImageOnPage) - assert result.description == expected_html + assert result.description == "" + assert result.title == "Logo" assert result.bytes == b"image_bytes" assert result.page_num == 0 assert result.figure_id == "1" @@ -186,7 +177,6 @@ async def mock_poller_result(): parser = DocumentAnalysisParser( endpoint="https://example.com", credential=MockAzureCredential(), - media_description_strategy=MediaDescriptionStrategy.NONE, ) content = io.BytesIO(b"pdf content bytes") content.name = "test.pdf" @@ -259,7 +249,6 @@ async def mock_poller_result(): parser = DocumentAnalysisParser( endpoint="https://example.com", credential=MockAzureCredential(), - media_description_strategy=MediaDescriptionStrategy.NONE, ) with open(TEST_DATA_DIR / "Simple Table.pdf", "rb") as f: content = io.BytesIO(f.read()) @@ -304,16 +293,9 @@ async def mock_poller_result(): monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", mock_begin_analyze_document) monkeypatch.setattr(mock_poller, "result", mock_poller_result) - async def mock_describe_image(self, image_bytes): - return "Pie chart" - - monkeypatch.setattr(ContentUnderstandingDescriber, "describe_image", mock_describe_image) - parser = DocumentAnalysisParser( endpoint="https://example.com", credential=MockAzureCredential(), - media_description_strategy=MediaDescriptionStrategy.CONTENTUNDERSTANDING, - content_understanding_endpoint="https://example.com", ) with open(TEST_DATA_DIR / "Simple Figure.pdf", "rb") as f: @@ -327,8 +309,9 @@ async def mock_describe_image(self, image_bytes): assert pages[0].offset == 0 assert ( pages[0].text - == "# Simple Figure\n\nThis text is before the figure and NOT part of it.\n\n\n
1.1 Figure 1
Pie chart
\n\n\nThis is text after the figure that's not part of it." + == '# Simple Figure\n\nThis text is before the figure and NOT part of it.\n\n\n
\n\n\nThis is text after the figure that\'s not part of it.' ) + assert pages[0].images[0].placeholder == '
' @pytest.mark.asyncio @@ -376,14 +359,12 @@ async def mock_poller_result(): parser = DocumentAnalysisParser( endpoint="https://example.com", credential=MockAzureCredential(), - media_description_strategy=MediaDescriptionStrategy.CONTENTUNDERSTANDING, - content_understanding_endpoint="https://example.com", ) content = io.BytesIO(b"pdf content bytes") content.name = "test.docx" with caplog.at_level(logging.ERROR): pages = [page async for page in parser.parse(content)] - assert "This document type does not support media description." in caplog.text + assert "does not support high-resolution figure extraction" in caplog.text assert len(pages) == 1 assert pages[0].page_num == 0 @@ -392,75 +373,59 @@ async def mock_poller_result(): @pytest.mark.asyncio -async def test_parse_doc_with_openai(monkeypatch): - mock_poller = MagicMock() - - async def mock_begin_analyze_document(self, model_id, analyze_request, **kwargs): - return mock_poller - - async def mock_poller_result(): - content = open(TEST_DATA_DIR / "Simple Figure_content.txt").read() - return AnalyzeResult( - content=content, - pages=[DocumentPage(page_number=1, spans=[DocumentSpan(offset=0, length=148)])], - figures=[ - DocumentFigure( - id="1.1", - caption=DocumentCaption(content="Figure 1"), - bounding_regions=[ - BoundingRegion( - page_number=1, polygon=[0.4295, 1.3072, 1.7071, 1.3076, 1.7067, 2.6088, 0.4291, 2.6085] - ) - ], - spans=[DocumentSpan(offset=70, length=22)], - ) - ], - ) - - monkeypatch.setattr(DocumentIntelligenceClient, "begin_analyze_document", mock_begin_analyze_document) - monkeypatch.setattr(mock_poller, "result", mock_poller_result) +async def test_figure_processor_openai_requires_client(): + figure_processor = FigureProcessor(strategy=MediaDescriptionStrategy.OPENAI) - async def mock_describe_image(self, image_bytes): - return "Pie chart" + with pytest.raises(ValueError, match="requires both a client and a model name"): + await figure_processor.describe(b"bytes") - 
monkeypatch.setattr(MultimodalModelDescriber, "describe_image", mock_describe_image) - parser = DocumentAnalysisParser( - endpoint="https://example.com", - credential=MockAzureCredential(), - media_description_strategy=MediaDescriptionStrategy.OPENAI, +@pytest.mark.asyncio +async def test_figure_processor_openai_describe(monkeypatch): + figure_processor = FigureProcessor( + strategy=MediaDescriptionStrategy.OPENAI, openai_client=Mock(), openai_model="gpt-4o", openai_deployment="gpt-4o", ) - with open(TEST_DATA_DIR / "Simple Figure.pdf", "rb") as f: - content = io.BytesIO(f.read()) - content.name = "Simple Figure.pdf" + describer = AsyncMock() + describer.describe_image.return_value = "Pie chart" - pages = [page async for page in parser.parse(content)] + async def fake_get_media_describer(self): + return describer - assert len(pages) == 1 - assert pages[0].page_num == 0 - assert pages[0].offset == 0 - assert ( - pages[0].text - == "# Simple Figure\n\nThis text is before the figure and NOT part of it.\n\n\n
1.1 Figure 1
Pie chart
\n\n\nThis is text after the figure that's not part of it." - ) + monkeypatch.setattr(FigureProcessor, "get_media_describer", fake_get_media_describer) + + result = await figure_processor.describe(b"bytes") + + assert result == "Pie chart" + describer.describe_image.assert_awaited_once() @pytest.mark.asyncio -async def test_parse_doc_with_openai_missing_parameters(): - parser = DocumentAnalysisParser( - endpoint="https://example.com", +async def test_figure_processor_content_understanding_initializes_once(monkeypatch): + figure_processor = FigureProcessor( + strategy=MediaDescriptionStrategy.CONTENTUNDERSTANDING, credential=MockAzureCredential(), - media_description_strategy=MediaDescriptionStrategy.OPENAI, - # Intentionally not providing openai_client and openai_model + content_understanding_endpoint="https://example.com", ) - content = io.BytesIO(b"pdf content bytes") - content.name = "test.pdf" + class FakeDescriber: + def __init__(self, endpoint, credential): + self.endpoint = endpoint + self.credential = credential + self.create_analyzer = AsyncMock() + self.describe_image = AsyncMock(return_value="A diagram") + + monkeypatch.setattr("prepdocslib.figureprocessor.ContentUnderstandingDescriber", FakeDescriber) + + result_first = await figure_processor.describe(b"image") + assert result_first == "A diagram" + describer_instance = figure_processor._media_describer # type: ignore[attr-defined] + assert isinstance(describer_instance, FakeDescriber) + describer_instance.create_analyzer.assert_awaited_once() - with pytest.raises(ValueError, match="OpenAI client must be provided when using OpenAI media description strategy"): - # Call the first iteration of the generator without using async for - await parser.parse(content).__anext__() + result_second = await figure_processor.describe(b"image") + assert result_second == "A diagram" + assert describer_instance.create_analyzer.await_count == 1 From b971ca7b1c70ea0ddda708910c723554e13e7d79 Mon Sep 17 00:00:00 2001 From: 
Pamela Fox Date: Mon, 3 Nov 2025 13:34:33 -0800 Subject: [PATCH 02/30] More Bicep to get funcs deployed with auth --- AGENTS.md | 9 -- app/backend/prepdocs.py | 7 +- .../prepdocslib/cloudingestionstrategy.py | 24 +++- app/backend/prepdocslib/filestrategy.py | 4 +- .../integratedvectorizerstrategy.py | 2 +- app/backend/prepdocslib/searchmanager.py | 13 +- app/backend/prepdocslib/servicesetup.py | 7 +- infra/app/functions-app.bicep | 64 +++++++-- infra/app/functions.bicep | 124 ++++++++++++++---- infra/app/storage-containers.bicep | 24 ++++ infra/bicepconfig.json | 5 + infra/core/auth/appregistration.bicep | 91 +++++++++++++ infra/core/auth/appupdate.bicep | 42 ++++++ infra/main.bicep | 39 ++++-- tests/test_searchmanager.py | 5 +- 15 files changed, 384 insertions(+), 76 deletions(-) create mode 100644 infra/app/storage-containers.bicep create mode 100644 infra/bicepconfig.json create mode 100644 infra/core/auth/appregistration.bicep create mode 100644 infra/core/auth/appupdate.bicep diff --git a/AGENTS.md b/AGENTS.md index c3b0a4191d..6dc856ce38 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -52,15 +52,6 @@ When adding new azd environment variables, update: 1. .azdo/pipelines/azure-dev.yml: Add the new environment variable under `env` section 1. .github/workflows/azure-dev.yml: Add the new environment variable under `env` section -For cloud ingestion, `prepdocs.py --use-cloud-ingestion` expects the function endpoints and managed identity resource IDs in the azd environment. 
The search service must have a system- or user-assigned managed identity with access to the Azure Functions app: - -* `DOCUMENT_EXTRACTOR_SKILL_ENDPOINT` -* `DOCUMENT_EXTRACTOR_SKILL_RESOURCE_ID` -* `FIGURE_PROCESSOR_SKILL_ENDPOINT` -* `FIGURE_PROCESSOR_SKILL_RESOURCE_ID` -* `TEXT_PROCESSOR_SKILL_ENDPOINT` -* `TEXT_PROCESSOR_SKILL_RESOURCE_ID` - ## Adding a new setting to "Developer Settings" in RAG app When adding a new developer setting, update: diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py index 6770ca310a..c1e68d318d 100644 --- a/app/backend/prepdocs.py +++ b/app/backend/prepdocs.py @@ -286,11 +286,6 @@ async def main(strategy: Strategy, setup_index: bool = True): parser.add_argument( "--disablebatchvectors", action="store_true", help="Don't compute embeddings in batch for the sections" ) - parser.add_argument( - "--use-cloud-ingestion", - action="store_true", - help="Use Azure AI Search indexer with cloud-hosted custom skills instead of local ingestion", - ) parser.add_argument( "--remove", action="store_true", @@ -345,7 +340,7 @@ async def main(strategy: Strategy, setup_index: bool = True): use_acls = os.getenv("AZURE_USE_AUTHENTICATION", "").lower() == "true" enforce_access_control = os.getenv("AZURE_ENFORCE_ACCESS_CONTROL", "").lower() == "true" enable_global_documents = os.getenv("AZURE_ENABLE_GLOBAL_DOCUMENT_ACCESS", "").lower() == "true" - use_cloud_ingestion = args.use_cloud_ingestion or os.getenv("USE_CLOUD_INGESTION", "").lower() == "true" + use_cloud_ingestion = os.getenv("USE_CLOUD_INGESTION", "").lower() == "true" dont_use_vectors = os.getenv("USE_VECTORS", "").lower() == "false" use_agentic_retrieval = os.getenv("USE_AGENTIC_RETRIEVAL", "").lower() == "true" use_content_understanding = os.getenv("USE_MEDIA_DESCRIBER_AZURE_CU", "").lower() == "true" diff --git a/app/backend/prepdocslib/cloudingestionstrategy.py b/app/backend/prepdocslib/cloudingestionstrategy.py index 7139ee6412..80df119400 100644 --- 
a/app/backend/prepdocslib/cloudingestionstrategy.py +++ b/app/backend/prepdocslib/cloudingestionstrategy.py @@ -102,23 +102,33 @@ def __init__( self.indexer_name = f"{prefix}-indexer" self.data_source_name = f"{prefix}-blob" + def _ensure_default_scope(val: str) -> str: + # If already ends with '/.default' keep as-is. + if val.endswith("/.default"): + return val + # If already contains '.default' (rare variant) keep. + if val.endswith(".default"): + return val + # Append '/.default' consistently (works for both raw appId and api://appId forms). + return f"{val}/.default" + self.document_extractor = _SkillConfig( name=f"{prefix}-document-extractor-skill", description="Custom skill that downloads and parses source documents", uri=document_extractor_uri, - auth_resource_id=document_extractor_auth_resource_id, + auth_resource_id=_ensure_default_scope(document_extractor_auth_resource_id), ) self.figure_processor = _SkillConfig( name=f"{prefix}-figure-processor-skill", description="Custom skill that enriches individual figures", uri=figure_processor_uri, - auth_resource_id=figure_processor_auth_resource_id, + auth_resource_id=_ensure_default_scope(figure_processor_auth_resource_id), ) self.text_processor = _SkillConfig( name=f"{prefix}-text-processor-skill", description="Custom skill that merges figures, chunks text, and generates embeddings", uri=text_processor_uri, - auth_resource_id=text_processor_auth_resource_id, + auth_resource_id=_ensure_default_scope(text_processor_auth_resource_id), ) self._search_manager: SearchManager | None = None @@ -131,7 +141,7 @@ def _build_search_manager(self) -> SearchManager: search_info=self.search_info, search_analyzer_name=self.search_analyzer_name, use_acls=self.use_acls, - use_int_vectorization=True, + use_parent_index_projection=True, embeddings=self.embeddings, field_name_embedding=self.search_field_name_embedding, search_images=self.use_multimodal, @@ -222,8 +232,12 @@ def _build_text_processor_skill(self) -> WebApiSkill: ) def 
_build_skillset(self) -> SearchIndexerSkillset: + # NOTE: Do NOT map the chunk id directly to the index key field. Azure AI Search + # index projections forbid mapping an input field onto the target index key when + # using parent/child projections. The service will generate keys for projected + # child documents automatically. Removing the explicit 'id' mapping resolves + # HttpResponseError: "Input 'id' cannot map to the key field". mappings = [ - InputFieldMappingEntry(name="id", source="/document/chunks/*/id"), InputFieldMappingEntry(name="content", source="/document/chunks/*/content"), InputFieldMappingEntry(name="sourcepage", source="/document/chunks/*/sourcepage"), InputFieldMappingEntry(name="sourcefile", source="/document/chunks/*/sourcefile"), diff --git a/app/backend/prepdocslib/filestrategy.py b/app/backend/prepdocslib/filestrategy.py index e7f7e393f7..f996182651 100644 --- a/app/backend/prepdocslib/filestrategy.py +++ b/app/backend/prepdocslib/filestrategy.py @@ -92,7 +92,7 @@ def setup_search_manager(self): self.search_info, self.search_analyzer_name, self.use_acls, - False, + False, # use_parent_index_projection disabled for file-based ingestion self.embeddings, field_name_embedding=self.search_field_name_embedding, search_images=self.image_embeddings is not None, @@ -168,7 +168,7 @@ def __init__( search_info=self.search_info, search_analyzer_name=None, use_acls=True, - use_int_vectorization=False, + use_parent_index_projection=False, embeddings=self.embeddings, field_name_embedding=search_field_name_embedding, search_images=image_embeddings is not None, diff --git a/app/backend/prepdocslib/integratedvectorizerstrategy.py b/app/backend/prepdocslib/integratedvectorizerstrategy.py index 11e826c3e4..c843b9afea 100644 --- a/app/backend/prepdocslib/integratedvectorizerstrategy.py +++ b/app/backend/prepdocslib/integratedvectorizerstrategy.py @@ -134,7 +134,7 @@ async def setup(self): search_info=self.search_info, 
search_analyzer_name=self.search_analyzer_name, use_acls=self.use_acls, - use_int_vectorization=True, + use_parent_index_projection=True, embeddings=self.embeddings, field_name_embedding=self.search_field_name_embedding, search_images=False, diff --git a/app/backend/prepdocslib/searchmanager.py b/app/backend/prepdocslib/searchmanager.py index 0fa6a20d96..a3d75a47d9 100644 --- a/app/backend/prepdocslib/searchmanager.py +++ b/app/backend/prepdocslib/searchmanager.py @@ -69,7 +69,10 @@ def __init__( search_info: SearchInfo, search_analyzer_name: Optional[str] = None, use_acls: bool = False, - use_int_vectorization: bool = False, + # Renamed from use_int_vectorization to use_parent_index_projection to reflect + # that this flag controls parent/child index projection (adding parent_id and + # enhanced key field settings) rather than any specific vectorization mode. + use_parent_index_projection: bool = False, embeddings: Optional[OpenAIEmbeddings] = None, field_name_embedding: Optional[str] = None, search_images: bool = False, @@ -78,7 +81,7 @@ def __init__( self.search_info = search_info self.search_analyzer_name = search_analyzer_name self.use_acls = use_acls - self.use_int_vectorization = use_int_vectorization + self.use_parent_index_projection = use_parent_index_projection self.embeddings = embeddings self.embedding_dimensions = self.embeddings.open_ai_dimensions if self.embeddings else None self.field_name_embedding = field_name_embedding @@ -235,7 +238,7 @@ async def create_index(self): fields = [ ( SimpleField(name="id", type="Edm.String", key=True) - if not self.use_int_vectorization + if not self.use_parent_index_projection else SearchField( name="id", type="Edm.String", @@ -280,8 +283,8 @@ async def create_index(self): else SearchIndexPermissionFilterOption.DISABLED ) - if self.use_int_vectorization: - logger.info("Including parent_id field for integrated vectorization support in new index") + if self.use_parent_index_projection: + logger.info("Including 
parent_id field for parent/child index projection support in new index") fields.append(SearchableField(name="parent_id", type="Edm.String", filterable=True)) vectorizers: list[VectorSearchVectorizer] = [] diff --git a/app/backend/prepdocslib/servicesetup.py b/app/backend/prepdocslib/servicesetup.py index dbdbe7aaea..b5c1ccd5e3 100644 --- a/app/backend/prepdocslib/servicesetup.py +++ b/app/backend/prepdocslib/servicesetup.py @@ -142,9 +142,10 @@ def setup_image_embeddings_service( def setup_blob_manager( *, + azure_credential: AsyncTokenCredential | str, storage_account: str, storage_container: str, - credential: AsyncTokenCredential | str, + storage_key: str | None = None, storage_resource_group: str | None = None, subscription_id: str | None = None, image_storage_container: str | None = None, @@ -157,11 +158,13 @@ def setup_blob_manager( container when figures are stored separately. """ endpoint = f"https://{storage_account}.blob.core.windows.net" + storage_credential: AsyncTokenCredential | str = azure_credential if storage_key is None else storage_key + return BlobManager( endpoint=endpoint, container=storage_container, account=storage_account, - credential=credential, + credential=storage_credential, resource_group=storage_resource_group, subscription_id=subscription_id, image_container=image_storage_container, diff --git a/infra/app/functions-app.bicep b/infra/app/functions-app.bicep index 7710c27050..13f29e9370 100644 --- a/infra/app/functions-app.bicep +++ b/infra/app/functions-app.bicep @@ -2,6 +2,7 @@ param name string param location string = resourceGroup().location param tags object = {} +@description('Name of an existing Application Insights component. 
Leave empty to disable.') param applicationInsightsName string param appServicePlanId string param appSettings object = {} @@ -10,12 +11,17 @@ param runtimeVersion string param storageAccountName string param deploymentStorageContainerName string param instanceMemoryMB int = 2048 -param maximumInstanceCount int = 100 +param maximumInstanceCount int = 10 param identityId string param identityClientId string -param functionTimeout string = '00:05:00' +// App Registration client ID (applicationId) used to secure Function App endpoints (Easy Auth) +param skillAppClientId string = '' +// Audience / identifier URI to validate tokens (e.g. api:///-skill) +param skillAppAudience string = '' -var identityType = 'UserAssigned' +// AVM expects authentication.type values: SystemAssignedIdentity | UserAssignedIdentity | StorageAccountConnectionString +// Use UserAssignedIdentity for per-function user-assigned managed identity deployment storage access. +var identityType = 'UserAssignedIdentity' var kind = 'functionapp,linux' var applicationInsightsIdentity = 'ClientId=${identityClientId};Authorization=AAD' @@ -24,11 +30,11 @@ resource stg 'Microsoft.Storage/storageAccounts@2023-05-01' existing = { name: storageAccountName } -resource applicationInsights 'Microsoft.Insights/components@2020-02-02' existing = { +resource applicationInsights 'Microsoft.Insights/components@2020-02-02' existing = if (!empty(applicationInsightsName)) { name: applicationInsightsName } -// Create base application settings +// Create base application settings (independent of Application Insights) var baseAppSettings = { // Storage credentials for AzureWebJobsStorage AzureWebJobsStorage__credential: 'managedidentity' @@ -36,18 +42,23 @@ var baseAppSettings = { AzureWebJobsStorage__blobServiceUri: stg.properties.primaryEndpoints.blob AzureWebJobsStorage__queueServiceUri: stg.properties.primaryEndpoints.queue AzureWebJobsStorage__tableServiceUri: stg.properties.primaryEndpoints.table + 
FUNCTIONS_EXTENSION_VERSION: '~4' +} - // Application Insights +// Optional Application Insights settings +var appInsightsSettings = !empty(applicationInsightsName) ? { APPLICATIONINSIGHTS_AUTHENTICATION_STRING: applicationInsightsIdentity - APPLICATIONINSIGHTS_CONNECTION_STRING: applicationInsights.properties.ConnectionString + APPLICATIONINSIGHTS_CONNECTION_STRING: applicationInsights.?properties.ConnectionString ?? '' +} : {} - // Function timeout - FUNCTIONS_EXTENSION_VERSION: '~4' - FUNCTIONS_WORKER_RUNTIME: runtimeName -} +// Surface skill application identifiers for downstream logging/diagnostics (not used for manual validation now that Easy Auth is enabled) +var skillAudienceSettings = (!empty(skillAppClientId) && !empty(skillAppAudience)) ? { + SKILL_APP_ID: skillAppClientId + SKILL_APP_AUDIENCE: skillAppAudience +} : {} // Merge all app settings -var allAppSettings = union(appSettings, baseAppSettings) +var allAppSettings = union(appSettings, baseAppSettings, appInsightsSettings, skillAudienceSettings) // Create Flex Consumption Function App using AVM module functionApp 'br/public:avm/res/web/site:0.15.1' = { @@ -84,6 +95,8 @@ module functionApp 'br/public:avm/res/web/site:0.15.1' = { siteConfig: { alwaysOn: false functionAppScaleLimit: maximumInstanceCount + httpsOnly: true + ftpsState: 'Disabled' cors: { allowedOrigins: ['https://portal.azure.com'] } @@ -92,6 +105,33 @@ module functionApp 'br/public:avm/res/web/site:0.15.1' = { } } +// Enable Easy Auth (App Service authentication) for Azure Search custom skill access when a skillAppId is provided. +// Based on Microsoft guidance: require authentication, return 401 on unauthenticated, allowed audience api://{applicationId}. 
+resource auth 'Microsoft.Web/sites/config@2022-03-01' = if (!empty(skillAppClientId) && !empty(skillAppAudience)) { + name: '${name}/authsettingsV2' + properties: { + globalValidation: { + requireAuthentication: true + unauthenticatedClientAction: 'Return401' + } + identityProviders: { + azureActiveDirectory: { + enabled: true + registration: { + clientId: skillAppClientId + } + validation: { + allowedAudiences: [ skillAppAudience ] + } + } + } + } + dependsOn: [ functionApp ] +} + // Outputs output name string = functionApp.outputs.name output defaultHostname string = functionApp.outputs.defaultHostname +// Expose resourceId for downstream skill auth configuration +output resourceId string = functionApp.outputs.resourceId +output authEnabled bool = !empty(skillAppClientId) && !empty(skillAppAudience) diff --git a/infra/app/functions.bicep b/infra/app/functions.bicep index d8a6f8b06a..75cf78b9f0 100644 --- a/infra/app/functions.bicep +++ b/infra/app/functions.bicep @@ -19,6 +19,8 @@ param contentUnderstandingResourceGroupName string = '' param documentExtractorName string param figureProcessorName string param textProcessorName string +// OpenID issuer provided by main template (e.g. 
https://login.microsoftonline.com/{tenantId}/v2.0) +param openIdIssuer string // Shared configuration param useVectors bool @@ -91,19 +93,17 @@ var figureProcessorDeploymentContainer = 'deploy-figure-processor-${take(resourceToken, 7)}' var textProcessorDeploymentContainer = 'deploy-text-processor-${take(resourceToken, 7)}' // Create deployment containers in storage account -module deploymentContainers 'br/public:avm/res/storage/storage-account:0.8.3' = { +// Create deployment containers via cross-scope module (avoids re-deploying storage account configuration) +module deploymentContainers 'storage-containers.bicep' = { name: 'function-deployment-containers' scope: resourceGroup(storageResourceGroupName) params: { - name: storageAccountName - location: location - blobServices: { - containers: [ - { name: documentExtractorDeploymentContainer } - { name: figureProcessorDeploymentContainer } - { name: textProcessorDeploymentContainer } - ] - } + storageAccountName: storageAccountName + containerNames: [ + documentExtractorDeploymentContainer + figureProcessorDeploymentContainer + textProcessorDeploymentContainer + ] } } @@ -137,22 +137,63 @@ module figureProcessorIdentity 'br/public:avm/res/managed-identity/user-assigned } } -// App Service Plan (Flex Consumption) -module appServicePlan 'br/public:avm/res/web/serverfarm:0.1.1' = { - name: 'functions-plan' +// Flex Consumption supports only one Function App per plan; create a dedicated plan per ingestion function +module documentExtractorPlan 'br/public:avm/res/web/serverfarm:0.1.1' = { + name: 'doc-extractor-plan' + params: { + name: '${abbrs.webServerFarms}doc-extractor-${resourceToken}' + sku: { + name: 'FC1' + tier: 'FlexConsumption' + } + reserved: true + location: location + tags: tags + } +} + +module figureProcessorPlan 'br/public:avm/res/web/serverfarm:0.1.1' = { + name: 'figure-processor-plan' + params: { - name: '${abbrs.webServerFarms}functions-${resourceToken}' + name: '${abbrs.webServerFarms}figure-processor-${resourceToken}' 
sku: { name: 'FC1' tier: 'FlexConsumption' } - reserved: true // Required for Linux + reserved: true + location: location + tags: tags + } +} + +module textProcessorPlan 'br/public:avm/res/web/serverfarm:0.1.1' = { + name: 'text-processor-plan' + params: { + name: '${abbrs.webServerFarms}text-processor-${resourceToken}' + sku: { + name: 'FC1' + tier: 'FlexConsumption' + } + reserved: true location: location tags: tags } } // Document Extractor Function App +// App registration for document extractor (uses function identity principalId as FIC subject) +module documentExtractorAppReg '../core/auth/appregistration.bicep' = { + name: 'doc-extractor-appreg' + params: { + cloudEnvironment: environment().name + webAppIdentityId: documentExtractorIdentity.outputs.principalId + clientAppName: 'skill-${documentExtractorName}' + clientAppDisplayName: 'skill-${documentExtractorName}' + issuer: openIdIssuer + webAppEndpoint: 'https://${documentExtractorName}.azurewebsites.net' + } +} + module documentExtractor 'functions-app.bicep' = { name: 'document-extractor-func' params: { @@ -160,7 +201,7 @@ module documentExtractor 'functions-app.bicep' = { location: location tags: union(tags, { 'azd-service-name': 'document-extractor' }) applicationInsightsName: applicationInsightsName - appServicePlanId: appServicePlan.outputs.resourceId + appServicePlanId: documentExtractorPlan.outputs.resourceId runtimeName: 'python' runtimeVersion: '3.11' storageAccountName: storageAccountName @@ -170,7 +211,9 @@ module documentExtractor 'functions-app.bicep' = { appSettings: allAppSettings instanceMemoryMB: 4096 // High memory for document processing maximumInstanceCount: 100 - functionTimeout: '00:10:00' // 10 minutes for long-running extraction + // Removed unused functionTimeout parameter; configured defaults via host settings + skillAppClientId: documentExtractorAppReg.outputs.clientAppId + skillAppAudience: 'api://${documentExtractorAppReg.outputs.clientAppId}' } dependsOn: [ 
deploymentContainers @@ -178,6 +221,18 @@ module documentExtractor 'functions-app.bicep' = { } // Figure Processor Function App +module figureProcessorAppReg '../core/auth/appregistration.bicep' = { + name: 'figure-processor-appreg' + params: { + cloudEnvironment: environment().name + webAppIdentityId: figureProcessorIdentity.outputs.principalId + clientAppName: 'skill-${figureProcessorName}' + clientAppDisplayName: 'skill-${figureProcessorName}' + issuer: openIdIssuer + webAppEndpoint: 'https://${figureProcessorName}.azurewebsites.net' + } +} + module figureProcessor 'functions-app.bicep' = { name: 'figure-processor-func' params: { @@ -185,7 +240,7 @@ module figureProcessor 'functions-app.bicep' = { location: location tags: union(tags, { 'azd-service-name': 'figure-processor' }) applicationInsightsName: applicationInsightsName - appServicePlanId: appServicePlan.outputs.resourceId + appServicePlanId: figureProcessorPlan.outputs.resourceId runtimeName: 'python' runtimeVersion: '3.11' storageAccountName: storageAccountName @@ -195,7 +250,8 @@ module figureProcessor 'functions-app.bicep' = { appSettings: allAppSettings instanceMemoryMB: 2048 maximumInstanceCount: 100 - functionTimeout: '00:05:00' + skillAppClientId: figureProcessorAppReg.outputs.clientAppId + skillAppAudience: 'api://${figureProcessorAppReg.outputs.clientAppId}' } dependsOn: [ deploymentContainers @@ -203,6 +259,18 @@ module figureProcessor 'functions-app.bicep' = { } // Text Processor Function App +module textProcessorAppReg '../core/auth/appregistration.bicep' = { + name: 'text-processor-appreg' + params: { + cloudEnvironment: environment().name + webAppIdentityId: textProcessorIdentity.outputs.principalId + clientAppName: 'skill-${textProcessorName}' + clientAppDisplayName: 'skill-${textProcessorName}' + issuer: openIdIssuer + webAppEndpoint: 'https://${textProcessorName}.azurewebsites.net' + } +} + module textProcessor 'functions-app.bicep' = { name: 'text-processor-func' params: { @@ -210,7 
+278,7 @@ module textProcessor 'functions-app.bicep' = { location: location tags: union(tags, { 'azd-service-name': 'text-processor' }) applicationInsightsName: applicationInsightsName - appServicePlanId: appServicePlan.outputs.resourceId + appServicePlanId: textProcessorPlan.outputs.resourceId runtimeName: 'python' runtimeVersion: '3.11' storageAccountName: storageAccountName @@ -220,7 +288,8 @@ module textProcessor 'functions-app.bicep' = { appSettings: allAppSettings instanceMemoryMB: 2048 // Standard memory for embedding maximumInstanceCount: 100 - functionTimeout: '00:05:00' // 5 minutes default + skillAppClientId: textProcessorAppReg.outputs.clientAppId + skillAppAudience: 'api://${textProcessorAppReg.outputs.clientAppId}' } dependsOn: [ deploymentContainers @@ -282,10 +351,21 @@ module figureProcessorRbac 'functions-rbac.bicep' = { output documentExtractorName string = documentExtractor.outputs.name output documentExtractorUrl string = documentExtractor.outputs.defaultHostname output documentExtractorIdentityPrincipalId string = documentExtractorIdentity.outputs.principalId +output documentExtractorClientAppId string = documentExtractorAppReg.outputs.clientAppId +output documentExtractorSkillResourceId string = 'api://${documentExtractorAppReg.outputs.clientAppId}' output figureProcessorName string = figureProcessor.outputs.name output figureProcessorUrl string = figureProcessor.outputs.defaultHostname output figureProcessorIdentityPrincipalId string = figureProcessorIdentity.outputs.principalId +output figureProcessorClientAppId string = figureProcessorAppReg.outputs.clientAppId +output figureProcessorSkillResourceId string = 'api://${figureProcessorAppReg.outputs.clientAppId}' output textProcessorName string = textProcessor.outputs.name output textProcessorUrl string = textProcessor.outputs.defaultHostname output textProcessorIdentityPrincipalId string = textProcessorIdentity.outputs.principalId -output 
appServicePlanId string = appServicePlan.outputs.resourceId +output textProcessorClientAppId string = textProcessorAppReg.outputs.clientAppId +output textProcessorSkillResourceId string = 'api://${textProcessorAppReg.outputs.clientAppId}' +// Output the last plan id (text processor) for potential diagnostics; others can be added if needed +output appServicePlanId string = textProcessorPlan.outputs.resourceId +// Resource IDs for each function app (used for auth_resource_id with managed identity secured skills) +output documentExtractorResourceId string = documentExtractor.outputs.resourceId +output figureProcessorResourceId string = figureProcessor.outputs.resourceId +output textProcessorResourceId string = textProcessor.outputs.resourceId diff --git a/infra/app/storage-containers.bicep b/infra/app/storage-containers.bicep new file mode 100644 index 0000000000..bc3a45c13c --- /dev/null +++ b/infra/app/storage-containers.bicep @@ -0,0 +1,24 @@ +targetScope = 'resourceGroup' + +@description('Name of existing storage account to add deployment containers to') +param storageAccountName string +@description('List of container names to ensure exist') +param containerNames array + +// Existing storage account +resource stg 'Microsoft.Storage/storageAccounts@2023-05-01' existing = { + name: storageAccountName +} + +// Existing blob service +resource blob 'Microsoft.Storage/storageAccounts/blobServices@2023-05-01' existing = { + name: 'default' + parent: stg +} + +// Create each container (no public access, default properties) +resource containers 'Microsoft.Storage/storageAccounts/blobServices/containers@2023-05-01' = [for c in containerNames: { + name: c + parent: blob + properties: {} +}] diff --git a/infra/bicepconfig.json b/infra/bicepconfig.json new file mode 100644 index 0000000000..cd15f3f32a --- /dev/null +++ b/infra/bicepconfig.json @@ -0,0 +1,5 @@ +{ + "extensions": { + "microsoftGraphV1": "br:mcr.microsoft.com/bicep/extensions/microsoftgraph/v1.0:1.0.0" + } +} 
diff --git a/infra/core/auth/appregistration.bicep b/infra/core/auth/appregistration.bicep new file mode 100644 index 0000000000..3bf29ce701 --- /dev/null +++ b/infra/core/auth/appregistration.bicep @@ -0,0 +1,91 @@ +extension microsoftGraphV1 + +@description('Specifies the name of cloud environment to run this deployment in.') +param cloudEnvironment string = environment().name + +// NOTE: Microsoft Graph Bicep file deployment is only supported in Public Cloud +@description('Audience uris for public and national clouds') +param audiences object = { + AzureCloud: { + uri: 'api://AzureADTokenExchange' + } + AzureUSGovernment: { + uri: 'api://AzureADTokenExchangeUSGov' + } + USNat: { + uri: 'api://AzureADTokenExchangeUSNat' + } + USSec: { + uri: 'api://AzureADTokenExchangeUSSec' + } + AzureChinaCloud: { + uri: 'api://AzureADTokenExchangeChina' + } +} + +@description('Specifies the ID of the user-assigned managed identity.') +param webAppIdentityId string + +@description('Specifies the unique name for the client application.') +param clientAppName string + +@description('Specifies the display name for the client application') +param clientAppDisplayName string + +@description('Specifies the scopes that the client application requires.') +param clientAppScopes array = ['User.Read', 'offline_access', 'openid', 'profile'] + +param serviceManagementReference string = '' + +param issuer string + +param webAppEndpoint string + +// Get the MS Graph Service Principal based on its application ID: +// https://learn.microsoft.com/troubleshoot/entra/entra-id/governance/verify-first-party-apps-sign-in +var msGraphAppId = '00000003-0000-0000-c000-000000000000' +resource msGraphSP 'Microsoft.Graph/servicePrincipals@v1.0' existing = { + appId: msGraphAppId +} + +var graphScopes = msGraphSP.oauth2PermissionScopes +resource clientApp 'Microsoft.Graph/applications@v1.0' = { + uniqueName: clientAppName + displayName: clientAppDisplayName + signInAudience: 'AzureADMyOrg' + 
serviceManagementReference: empty(serviceManagementReference) ? null : serviceManagementReference + web: { + redirectUris: [ + 'http://localhost:50505/.auth/login/aad/callback' + '${webAppEndpoint}/.auth/login/aad/callback' + ] + implicitGrantSettings: { enableIdTokenIssuance: true } + } + requiredResourceAccess: [ + { + resourceAppId: msGraphAppId + resourceAccess: [ + for (scope, i) in clientAppScopes: { + id: filter(graphScopes, graphScopes => graphScopes.value == scope)[0].id + type: 'Scope' + } + ] + } + ] + + resource clientAppFic 'federatedIdentityCredentials@v1.0' = { + name: '${clientApp.uniqueName}/miAsFic' + audiences: [ + audiences[cloudEnvironment].uri + ] + issuer: issuer + subject: webAppIdentityId + } +} + +resource clientSp 'Microsoft.Graph/servicePrincipals@v1.0' = { + appId: clientApp.appId +} + +output clientAppId string = clientApp.appId +output clientSpId string = clientSp.id diff --git a/infra/core/auth/appupdate.bicep b/infra/core/auth/appupdate.bicep new file mode 100644 index 0000000000..74c2e87b1b --- /dev/null +++ b/infra/core/auth/appupdate.bicep @@ -0,0 +1,42 @@ +param appServiceName string + +@description('The client ID of the Microsoft Entra application.') +param clientId string + +param openIdIssuer string + +resource appService 'Microsoft.Web/sites@2022-03-01' existing = { + name: appServiceName +} + +resource configAuth 'Microsoft.Web/sites/config@2022-03-01' = { + parent: appService + name: 'authsettingsV2' + properties: { + globalValidation: { + requireAuthentication: true + unauthenticatedClientAction: 'RedirectToLoginPage' + redirectToProvider: 'azureactivedirectory' + } + identityProviders: { + azureActiveDirectory: { + enabled: true + registration: { + clientId: clientId + clientSecretSettingName: 'OVERRIDE_USE_MI_FIC_ASSERTION_CLIENTID' + openIdIssuer: openIdIssuer + } + validation: { + defaultAuthorizationPolicy: { + allowedApplications: [] + } + } + } + } + login: { + tokenStore: { + enabled: true + } + } + } +} diff 
--git a/infra/main.bicep b/infra/main.bicep index a1dcfce1ce..1a50e42596 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -469,7 +469,7 @@ var appEnvVariables = { AZURE_SEARCH_SERVICE: searchService.outputs.name AZURE_SEARCH_SEMANTIC_RANKER: actualSearchServiceSemanticRankerLevel AZURE_SEARCH_QUERY_REWRITING: searchServiceQueryRewriting - AZURE_VISION_ENDPOINT: useMultimodal ? vision.outputs.endpoint : '' + AZURE_VISION_ENDPOINT: useMultimodal ? vision!.outputs.endpoint : '' AZURE_SEARCH_QUERY_LANGUAGE: searchQueryLanguage AZURE_SEARCH_QUERY_SPELLER: searchQuerySpeller AZURE_SEARCH_FIELD_NAME_EMBEDDING: searchFieldNameEmbedding @@ -538,6 +538,14 @@ var appEnvVariables = { RAG_SEARCH_IMAGE_EMBEDDINGS: ragSearchImageEmbeddings RAG_SEND_TEXT_SOURCES: ragSendTextSources RAG_SEND_IMAGE_SOURCES: ragSendImageSources + // Cloud ingestion skill endpoints (populated when useCloudIngestion) + DOCUMENT_EXTRACTOR_SKILL_ENDPOINT: useCloudIngestion ? 'https://${functions!.outputs.documentExtractorUrl}/api/extract' : '' + FIGURE_PROCESSOR_SKILL_ENDPOINT: useCloudIngestion ? 'https://${functions!.outputs.figureProcessorUrl}/api/process' : '' + TEXT_PROCESSOR_SKILL_ENDPOINT: useCloudIngestion ? 'https://${functions!.outputs.textProcessorUrl}/api/process' : '' + // Skill audience identifier URI from registration module (created below) + DOCUMENT_EXTRACTOR_SKILL_RESOURCE_ID: useCloudIngestion ? functions!.outputs.documentExtractorSkillResourceId : '' + FIGURE_PROCESSOR_SKILL_RESOURCE_ID: useCloudIngestion ? functions!.outputs.figureProcessorSkillResourceId : '' + TEXT_PROCESSOR_SKILL_RESOURCE_ID: useCloudIngestion ? functions!.outputs.textProcessorSkillResourceId : '' } // App Service for the web application (Python Quart app with JS frontend) @@ -665,18 +673,18 @@ module functions 'app/functions.bicep' = if (useCloudIngestion) { params: { location: location tags: tags - applicationInsightsName: useApplicationInsights ? 
monitoring.outputs.applicationInsightsName : '' + applicationInsightsName: useApplicationInsights ? monitoring!.outputs.applicationInsightsName : '' storageAccountName: storage.outputs.name storageResourceGroupName: storageResourceGroup.name searchServiceName: searchService.outputs.name searchServiceResourceGroupName: searchServiceResourceGroup.name - openAiServiceName: isAzureOpenAiHost ? openAi.outputs.name : '' + openAiServiceName: isAzureOpenAiHost ? openAi!.outputs.name : '' openAiResourceGroupName: openAiResourceGroup.name documentIntelligenceServiceName: documentIntelligence.outputs.name documentIntelligenceResourceGroupName: documentIntelligenceResourceGroup.name - visionServiceName: useMultimodal ? vision.outputs.name : '' + visionServiceName: useMultimodal ? vision!.outputs.name : '' visionResourceGroupName: useMultimodal ? visionResourceGroup.name : resourceGroup.name - contentUnderstandingServiceName: useMediaDescriberAzureCU ? contentUnderstanding.outputs.name : '' + contentUnderstandingServiceName: useMediaDescriberAzureCU ? contentUnderstanding!.outputs.name : '' contentUnderstandingResourceGroupName: useMediaDescriberAzureCU ? 
contentUnderstandingResourceGroup.name : resourceGroup.name documentExtractorName: '${abbrs.webSitesFunctions}doc-extractor-${resourceToken}' figureProcessorName: '${abbrs.webSitesFunctions}figure-processor-${resourceToken}' @@ -695,6 +703,7 @@ module functions 'app/functions.bicep' = if (useCloudIngestion) { openAiChatDeployment: chatGpt.deploymentName openAiChatModelName: chatGpt.modelName openAiCustomUrl: azureOpenAiCustomUrl + openIdIssuer: authenticationIssuerUri } } @@ -1208,7 +1217,8 @@ module storageOwnerRoleBackend 'core/security/role.bicep' = if (useUserUpload) { } } -module storageRoleSearchService 'core/security/role.bicep' = if (useIntegratedVectorization) { +// Search service needs blob read access for both integrated vectorization and cloud ingestion indexer data source +module storageRoleSearchService 'core/security/role.bicep' = if (useIntegratedVectorization || useCloudIngestion) { scope: storageResourceGroup name: 'storage-role-searchservice' params: { @@ -1462,11 +1472,11 @@ output AZURE_OPENAI_EVAL_MODEL string = isAzureOpenAiHost && useEval ? eval.mode output AZURE_OPENAI_SEARCHAGENT_DEPLOYMENT string = isAzureOpenAiHost && useAgenticRetrieval ? searchAgent.deploymentName : '' output AZURE_OPENAI_SEARCHAGENT_MODEL string = isAzureOpenAiHost && useAgenticRetrieval ? searchAgent.modelName : '' output AZURE_OPENAI_REASONING_EFFORT string = defaultReasoningEffort -output AZURE_SPEECH_SERVICE_ID string = useSpeechOutputAzure ? speech.outputs.resourceId : '' -output AZURE_SPEECH_SERVICE_LOCATION string = useSpeechOutputAzure ? speech.outputs.location : '' +output AZURE_SPEECH_SERVICE_ID string = useSpeechOutputAzure ? speech!.outputs.resourceId : '' +output AZURE_SPEECH_SERVICE_LOCATION string = useSpeechOutputAzure ? speech!.outputs.location : '' -output AZURE_VISION_ENDPOINT string = useMultimodal ? vision.outputs.endpoint : '' -output AZURE_CONTENTUNDERSTANDING_ENDPOINT string = useMediaDescriberAzureCU ? 
contentUnderstanding.outputs.endpoint : '' +output AZURE_VISION_ENDPOINT string = useMultimodal ? vision!.outputs.endpoint : '' +output AZURE_CONTENTUNDERSTANDING_ENDPOINT string = useMediaDescriberAzureCU ? contentUnderstanding!.outputs.endpoint : '' output AZURE_DOCUMENTINTELLIGENCE_SERVICE string = documentIntelligence.outputs.name output AZURE_DOCUMENTINTELLIGENCE_RESOURCE_GROUP string = documentIntelligenceResourceGroup.name @@ -1494,6 +1504,15 @@ output AZURE_USERSTORAGE_RESOURCE_GROUP string = storageResourceGroup.name output AZURE_IMAGESTORAGE_CONTAINER string = useMultimodal ? imageStorageContainerName : '' +// Cloud ingestion function skill endpoints & resource IDs +output DOCUMENT_EXTRACTOR_SKILL_ENDPOINT string = useCloudIngestion ? 'https://${functions!.outputs.documentExtractorUrl}/api/extract' : '' +output FIGURE_PROCESSOR_SKILL_ENDPOINT string = useCloudIngestion ? 'https://${functions!.outputs.figureProcessorUrl}/api/process' : '' +output TEXT_PROCESSOR_SKILL_ENDPOINT string = useCloudIngestion ? 'https://${functions!.outputs.textProcessorUrl}/api/process' : '' +// Identifier URI used as authResourceId for all custom skill endpoints +output DOCUMENT_EXTRACTOR_SKILL_RESOURCE_ID string = useCloudIngestion ? functions!.outputs.documentExtractorSkillResourceId : '' +output FIGURE_PROCESSOR_SKILL_RESOURCE_ID string = useCloudIngestion ? functions!.outputs.figureProcessorSkillResourceId : '' +output TEXT_PROCESSOR_SKILL_RESOURCE_ID string = useCloudIngestion ? functions!.outputs.textProcessorSkillResourceId : '' + output AZURE_AI_PROJECT string = useAiProject ? 
ai.outputs.projectName : '' output AZURE_USE_AUTHENTICATION bool = useAuthentication diff --git a/tests/test_searchmanager.py b/tests/test_searchmanager.py index 6cf7b6d10a..1616141a60 100644 --- a/tests/test_searchmanager.py +++ b/tests/test_searchmanager.py @@ -53,7 +53,7 @@ async def mock_list_index_names(self): monkeypatch.setattr(SearchIndexClient, "create_index", mock_create_index) monkeypatch.setattr(SearchIndexClient, "list_index_names", mock_list_index_names) - manager = SearchManager(search_info, use_int_vectorization=False, field_name_embedding="embedding") + manager = SearchManager(search_info, use_parent_index_projection=False, field_name_embedding="embedding") await manager.create_index() assert len(indexes) == 1, "It should have created one index" assert indexes[0].name == "test" @@ -76,7 +76,7 @@ async def mock_list_index_names(self): manager = SearchManager( search_info, - use_int_vectorization=True, + use_parent_index_projection=True, field_name_embedding="embedding", ) await manager.create_index() @@ -634,6 +634,7 @@ async def mock_upload_documents(self, documents): description="Test image", figure_id="fig1", page_num=0, + placeholder="
", # required positional arg url="http://example.com/img1.png", embedding=[0.01, 0.02], ) From df0c17a17a4cd8f3b4fc0519252ac1785378876c Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Mon, 3 Nov 2025 21:19:28 -0800 Subject: [PATCH 03/30] chore(functions): add missing prepdocslib dependencies to function requirements --- .../document_extractor/requirements.txt | 21 ++++++++++++------- .../figure_processor/requirements.txt | 19 +++++++++++------ app/functions/text_processor/requirements.txt | 21 ++++++++++++------- 3 files changed, 39 insertions(+), 22 deletions(-) diff --git a/app/functions/document_extractor/requirements.txt b/app/functions/document_extractor/requirements.txt index fd693a5f09..49ab46bcaa 100644 --- a/app/functions/document_extractor/requirements.txt +++ b/app/functions/document_extractor/requirements.txt @@ -1,9 +1,14 @@ -# Azure Functions runtime azure-functions>=1.21.3,<2.0.0 - -# Local dependency: prepdocslib -../../backend/prepdocslib - -# Note: prepdocslib's dependencies will be pulled in automatically -# including: azure-ai-documentintelligence, azure-storage-blob, -# azure-identity, openai, pymupdf, beautifulsoup4, etc. +azure-identity +azure-core +azure-storage-blob +azure-storage-file-datalake +azure-search-documents==11.7.0b1 +azure-ai-documentintelligence==1.0.0b4 +typing-extensions +beautifulsoup4 +pillow +PyMuPDF +pypdf +tenacity +rich diff --git a/app/functions/figure_processor/requirements.txt b/app/functions/figure_processor/requirements.txt index ca9be31e1c..49ab46bcaa 100644 --- a/app/functions/figure_processor/requirements.txt +++ b/app/functions/figure_processor/requirements.txt @@ -1,7 +1,14 @@ -# Azure Functions runtime azure-functions>=1.21.3,<2.0.0 - -# Local dependency: prepdocslib -../../backend/prepdocslib - -# prepdocslib brings in azure-identity, azure-storage-blob, openai, etc. 
+azure-identity +azure-core +azure-storage-blob +azure-storage-file-datalake +azure-search-documents==11.7.0b1 +azure-ai-documentintelligence==1.0.0b4 +typing-extensions +beautifulsoup4 +pillow +PyMuPDF +pypdf +tenacity +rich diff --git a/app/functions/text_processor/requirements.txt b/app/functions/text_processor/requirements.txt index 4806076325..49ab46bcaa 100644 --- a/app/functions/text_processor/requirements.txt +++ b/app/functions/text_processor/requirements.txt @@ -1,9 +1,14 @@ -# Azure Functions runtime azure-functions>=1.21.3,<2.0.0 - -# Local dependency: prepdocslib -../../backend/prepdocslib - -# Note: prepdocslib's dependencies will be pulled in automatically -# including: azure-search-documents, azure-storage-blob, -# azure-identity, openai, tiktoken, etc. +azure-identity +azure-core +azure-storage-blob +azure-storage-file-datalake +azure-search-documents==11.7.0b1 +azure-ai-documentintelligence==1.0.0b4 +typing-extensions +beautifulsoup4 +pillow +PyMuPDF +pypdf +tenacity +rich From e805ee3bf9ce516a183bdcfe49646b80ee926f04 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Mon, 3 Nov 2025 21:31:56 -0800 Subject: [PATCH 04/30] build(functions): vendor dependencies into .python_packages for flex consumption --- azure.yaml | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/azure.yaml b/azure.yaml index 7de8ec5c3a..46f0167363 100644 --- a/azure.yaml +++ b/azure.yaml @@ -44,14 +44,50 @@ services: project: ./app/functions/document_extractor language: py host: function + hooks: + prepackage: + windows: + shell: pwsh + run: python ../../../scripts/copy_prepdocslib.py;python -m pip install --upgrade pip;python -m pip install -r requirements.txt -t .python_packages/lib/site-packages + interactive: false + continueOnError: false + posix: + shell: sh + run: python ../../../scripts/copy_prepdocslib.py && python -m pip install --upgrade pip && python -m pip install -r requirements.txt -t .python_packages/lib/site-packages + 
interactive: false + continueOnError: false figure-processor: project: ./app/functions/figure_processor language: py host: function + hooks: + prepackage: + windows: + shell: pwsh + run: python ../../../scripts/copy_prepdocslib.py;python -m pip install --upgrade pip;python -m pip install -r requirements.txt -t .python_packages/lib/site-packages + interactive: false + continueOnError: false + posix: + shell: sh + run: python ../../../scripts/copy_prepdocslib.py && python -m pip install --upgrade pip && python -m pip install -r requirements.txt -t .python_packages/lib/site-packages + interactive: false + continueOnError: false text-processor: project: ./app/functions/text_processor language: py host: function + hooks: + prepackage: + windows: + shell: pwsh + run: python ../../../scripts/copy_prepdocslib.py;python -m pip install --upgrade pip;python -m pip install -r requirements.txt -t .python_packages/lib/site-packages + interactive: false + continueOnError: false + posix: + shell: sh + run: python ../../../scripts/copy_prepdocslib.py && python -m pip install --upgrade pip && python -m pip install -r requirements.txt -t .python_packages/lib/site-packages + interactive: false + continueOnError: false hooks: preprovision: windows: From 253cb7eea1fb4b64f0d2a2ca44d94de264786256 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Mon, 3 Nov 2025 21:37:30 -0800 Subject: [PATCH 05/30] chore(functions): copy backend requirements as requirements.backend.txt for traceability --- scripts/copy_prepdocslib.py | 54 +++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 scripts/copy_prepdocslib.py diff --git a/scripts/copy_prepdocslib.py b/scripts/copy_prepdocslib.py new file mode 100644 index 0000000000..d136fc67e2 --- /dev/null +++ b/scripts/copy_prepdocslib.py @@ -0,0 +1,54 @@ +"""Synchronize shared ingestion library and backend requirements with each +Azure Function project prior to packaging. + +What this script does: +1. 
Copies the `prepdocslib` directory into every function service directory so + that relative imports succeed at build and runtime. +2. Copies the backend `requirements.txt` alongside the function code as + `requirements.backend.txt` for traceability and potential future merges. + +Why we don't overwrite the function's own `requirements.txt`: +Each function has a minimal dependency list to reduce cold start time. The +backend dependency set is larger (includes web framework, tracing, etc.) and +is preserved separately should we later decide to consolidate pins. +""" + +from __future__ import annotations + +import shutil +from pathlib import Path + + +def copy_tree(src: Path, dest: Path) -> None: + if dest.exists(): + shutil.rmtree(dest) + shutil.copytree(src, dest) + + +def main() -> None: + repo_root = Path(__file__).resolve().parent.parent + prep_source = repo_root / "app" / "backend" / "prepdocslib" + if not prep_source.exists(): + raise RuntimeError(f"Source prepdocslib directory not found: {prep_source}") + + backend_requirements = repo_root / "app" / "backend" / "requirements.txt" + if not backend_requirements.exists(): + raise RuntimeError(f"Backend requirements file not found: {backend_requirements}") + + targets = [ + repo_root / "app" / "functions" / "document_extractor" / "prepdocslib", + repo_root / "app" / "functions" / "figure_processor" / "prepdocslib", + repo_root / "app" / "functions" / "text_processor" / "prepdocslib", + ] + + for target in targets: + target.parent.mkdir(parents=True, exist_ok=True) + # Copy library tree + copy_tree(prep_source, target) + # Copy backend requirements next to the function-specific one for reference + dest_req = target.parent / "requirements.backend.txt" + shutil.copy2(backend_requirements, dest_req) + + +if __name__ == "__main__": + main() From d66a6201e7a55a98dbba129c9d957365902cdb04 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Mon, 3 Nov 2025 21:39:21 -0800 Subject: [PATCH 06/30] chore(functions): overwrite 
function requirements with backend pins (backup original) --- scripts/copy_prepdocslib.py | 49 ++++++++++++++++++++++++------------- 1 file changed, 32 insertions(+), 17 deletions(-) diff --git a/scripts/copy_prepdocslib.py b/scripts/copy_prepdocslib.py index d136fc67e2..05731d9e4d 100644 --- a/scripts/copy_prepdocslib.py +++ b/scripts/copy_prepdocslib.py @@ -1,16 +1,17 @@ -"""Synchronize shared ingestion library and backend requirements with each -Azure Function project prior to packaging. - -What this script does: -1. Copies the `prepdocslib` directory into every function service directory so - that relative imports succeed at build and runtime. -2. Copies the backend `requirements.txt` alongside the function code as - `requirements.backend.txt` for traceability and potential future merges. - -Why we don't overwrite the function's own `requirements.txt`: -Each function has a minimal dependency list to reduce cold start time. The -backend dependency set is larger (includes web framework, tracing, etc.) and -is preserved separately should we later decide to consolidate pins. +"""Synchronize shared ingestion library and unify dependencies across all +Azure Function projects prior to packaging. + +Actions: +1. Copy `prepdocslib` into each function directory. +2. Overwrite each function's `requirements.txt` with the backend + `requirements.txt` (full set of pins) for consistent dependency versions. +3. Preserve the original function `requirements.txt` (if it existed) as + `requirements.functions.txt` for rollback/reference. +4. Also copy the backend requirements as `requirements.backend.txt` for audit. + +Note: Using the full backend dependency set will increase package size and may +slightly impact cold start, but ensures all transitive imports (e.g. azure.core) +are available without manual curation. 
""" from __future__ import annotations @@ -42,11 +43,25 @@ def main() -> None: ] for target in targets: - target.parent.mkdir(parents=True, exist_ok=True) - # Copy library tree + func_dir = target.parent + func_dir.mkdir(parents=True, exist_ok=True) + + # 1. Library sync copy_tree(prep_source, target) - # Copy backend requirements next to the function-specific one for reference - dest_req = target.parent / "requirements.backend.txt" + + # 2. Preserve original requirements if present + original_req = func_dir / "requirements.txt" + if original_req.exists(): + backup_req = func_dir / "requirements.functions.txt" + # Only backup if we haven't already + if not backup_req.exists(): + shutil.copy2(original_req, backup_req) + + # 3. Overwrite with backend requirements + shutil.copy2(backend_requirements, original_req) + + # 4. Copy backend requirements for explicit provenance + dest_req = func_dir / "requirements.backend.txt" shutil.copy2(backend_requirements, dest_req) From 0d7e8a9e0f038a7cd59b9a2918a003dc10c2b524 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Mon, 3 Nov 2025 21:40:51 -0800 Subject: [PATCH 07/30] chore(functions): remove requirements backup; always overwrite with backend pins --- scripts/copy_prepdocslib.py | 39 +++++++++++++------------------------ 1 file changed, 14 insertions(+), 25 deletions(-) diff --git a/scripts/copy_prepdocslib.py b/scripts/copy_prepdocslib.py index 05731d9e4d..540e97987c 100644 --- a/scripts/copy_prepdocslib.py +++ b/scripts/copy_prepdocslib.py @@ -1,17 +1,13 @@ -"""Synchronize shared ingestion library and unify dependencies across all -Azure Function projects prior to packaging. +"""Synchronize ingestion library and apply unified dependency pins. -Actions: +Steps: 1. Copy `prepdocslib` into each function directory. -2. Overwrite each function's `requirements.txt` with the backend - `requirements.txt` (full set of pins) for consistent dependency versions. -3. 
Preserve the original function `requirements.txt` (if it existed) as - `requirements.functions.txt` for rollback/reference. -4. Also copy the backend requirements as `requirements.backend.txt` for audit. - -Note: Using the full backend dependency set will increase package size and may -slightly impact cold start, but ensures all transitive imports (e.g. azure.core) -are available without manual curation. +2. Overwrite each function's `requirements.txt` with backend `requirements.txt`. +3. Copy backend requirements again as `requirements.backend.txt` for audit. + +No backups retained (per user request). The previous minimal requirements are +discarded. All functions now share identical pinned versions ensuring imports +like `azure.core` are available. """ from __future__ import annotations @@ -49,20 +45,13 @@ def main() -> None: # 1. Library sync copy_tree(prep_source, target) - # 2. Preserve original requirements if present - original_req = func_dir / "requirements.txt" - if original_req.exists(): - backup_req = func_dir / "requirements.functions.txt" - # Only backup if we haven't already - if not backup_req.exists(): - shutil.copy2(original_req, backup_req) - - # 3. Overwrite with backend requirements - shutil.copy2(backend_requirements, original_req) + # 2. Overwrite requirements.txt directly + overwrite_req = func_dir / "requirements.txt" + shutil.copy2(backend_requirements, overwrite_req) - # 4. Copy backend requirements for explicit provenance - dest_req = func_dir / "requirements.backend.txt" - shutil.copy2(backend_requirements, dest_req) + # 3. 
Copy backend requirements for explicit provenance + audit_req = func_dir / "requirements.backend.txt" + shutil.copy2(backend_requirements, audit_req) if __name__ == "__main__": From 12d71d56591996a3ea25d33929fc07c812fdee1f Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Mon, 3 Nov 2025 23:01:30 -0800 Subject: [PATCH 08/30] Get function apps deployed --- .gitignore | 2 + AGENTS.md | 1 + app/backend/requirements.in | 1 + app/backend/requirements.txt | 5 +- .../document_extractor/requirements.txt | 473 +++++++++++++++++- .../figure_processor/function_app.py | 4 +- .../figure_processor/requirements.txt | 473 +++++++++++++++++- app/functions/text_processor/requirements.txt | 473 +++++++++++++++++- azure.yaml | 12 +- infra/app/functions.bicep | 161 +++++- scripts/copy_prepdocslib.py | 9 - 11 files changed, 1535 insertions(+), 79 deletions(-) diff --git a/.gitignore b/.gitignore index 05bbf3b060..0102334ea2 100644 --- a/.gitignore +++ b/.gitignore @@ -148,6 +148,8 @@ npm-debug.log* node_modules static/ +app/functions/*/prepdocslib/ + data/**/*.md5 .DS_Store diff --git a/AGENTS.md b/AGENTS.md index 6dc856ce38..a4c85529cb 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -21,6 +21,7 @@ If necessary, edit this file to ensure it accurately reflects the current state * app/backend/prepdocslib/pdfparser.py: Uses Azure Document Intelligence to emit page text plus figure placeholders * app/backend/prepdocslib/figureprocessor.py: Shared helper that generates figure descriptions for both local ingestion and the cloud figure-processor skill * app/backend/app.py: The main entry point for the backend application. + * app/functions: Azure Functions used for cloud ingestion custom skills (document extraction, figure processing, text processing). Each function bundles a synchronized copy of `prepdocslib`; run `python scripts/copy_prepdocslib.py` to refresh the local copies if you modify the library. * app/frontend: Contains the React frontend code, built with TypeScript, built with vite. 
* app/frontend/src/api: Contains the API client code for communicating with the backend. * app/frontend/src/components: Contains the React components for the frontend. diff --git a/app/backend/requirements.in b/app/backend/requirements.in index 1110ef5546..8467265018 100644 --- a/app/backend/requirements.in +++ b/app/backend/requirements.in @@ -1,3 +1,4 @@ +azure-functions>=1.24.0 azure-identity quart quart-cors diff --git a/app/backend/requirements.txt b/app/backend/requirements.txt index 36475a326b..8cd04cf565 100644 --- a/app/backend/requirements.txt +++ b/app/backend/requirements.txt @@ -47,6 +47,8 @@ azure-core-tracing-opentelemetry==1.0.0b11 # via azure-monitor-opentelemetry azure-cosmos==4.9.0 # via -r requirements.in +azure-functions==1.24.0 + # via -r requirements.in azure-identity==1.17.1 # via # -r requirements.in @@ -439,8 +441,9 @@ urllib3==2.5.0 # via requests uvicorn==0.30.6 # via -r requirements.in -werkzeug==3.0.6 +werkzeug==3.1.3 # via + # azure-functions # flask # quart wrapt==1.16.0 diff --git a/app/functions/document_extractor/requirements.txt b/app/functions/document_extractor/requirements.txt index 49ab46bcaa..8cd04cf565 100644 --- a/app/functions/document_extractor/requirements.txt +++ b/app/functions/document_extractor/requirements.txt @@ -1,14 +1,461 @@ -azure-functions>=1.21.3,<2.0.0 -azure-identity -azure-core -azure-storage-blob -azure-storage-file-datalake -azure-search-documents==11.7.0b1 +# This file was autogenerated by uv via the following command: +# uv pip compile requirements.in -o requirements.txt --python-version 3.9 +aiofiles==24.1.0 + # via + # prompty + # quart +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.12.14 + # via + # -r requirements.in + # microsoft-kiota-authentication-azure +aiosignal==1.4.0 + # via aiohttp +annotated-types==0.7.0 + # via pydantic +anyio==4.4.0 + # via + # httpx + # openai +asgiref==3.10.0 + # via opentelemetry-instrumentation-asgi +async-timeout==5.0.1 + # via aiohttp +attrs==25.3.0 + # via 
aiohttp azure-ai-documentintelligence==1.0.0b4 -typing-extensions -beautifulsoup4 -pillow -PyMuPDF -pypdf -tenacity -rich + # via -r requirements.in +azure-cognitiveservices-speech==1.40.0 + # via -r requirements.in +azure-common==1.1.28 + # via azure-search-documents +azure-core==1.35.0 + # via + # azure-ai-documentintelligence + # azure-core-tracing-opentelemetry + # azure-cosmos + # azure-identity + # azure-monitor-opentelemetry + # azure-monitor-opentelemetry-exporter + # azure-search-documents + # azure-storage-blob + # azure-storage-file-datalake + # microsoft-kiota-authentication-azure + # msrest +azure-core-tracing-opentelemetry==1.0.0b11 + # via azure-monitor-opentelemetry +azure-cosmos==4.9.0 + # via -r requirements.in +azure-functions==1.24.0 + # via -r requirements.in +azure-identity==1.17.1 + # via + # -r requirements.in + # azure-monitor-opentelemetry-exporter + # msgraph-sdk +azure-monitor-opentelemetry==1.8.1 + # via -r requirements.in +azure-monitor-opentelemetry-exporter==1.0.0b44 + # via azure-monitor-opentelemetry +azure-search-documents==11.7.0b1 + # via -r requirements.in +azure-storage-blob==12.22.0 + # via + # -r requirements.in + # azure-storage-file-datalake +azure-storage-file-datalake==12.16.0 + # via -r requirements.in +beautifulsoup4==4.12.3 + # via -r requirements.in +blinker==1.8.2 + # via + # flask + # quart +certifi==2024.7.4 + # via + # httpcore + # httpx + # msrest + # requests +cffi==1.17.0 + # via cryptography +charset-normalizer==3.3.2 + # via requests +click==8.1.8 + # via + # flask + # prompty + # quart + # uvicorn +cryptography==44.0.1 + # via + # -r requirements.in + # azure-identity + # azure-storage-blob + # msal + # pyjwt +distro==1.9.0 + # via openai +exceptiongroup==1.3.0 + # via + # anyio + # hypercorn + # taskgroup +fixedint==0.1.6 + # via azure-monitor-opentelemetry-exporter +flask==3.0.3 + # via quart +frozenlist==1.4.1 + # via + # aiohttp + # aiosignal +h11==0.16.0 + # via + # httpcore + # hypercorn + # uvicorn + 
# wsproto +h2==4.3.0 + # via + # httpx + # hypercorn +hpack==4.1.0 + # via h2 +httpcore==1.0.9 + # via httpx +httpx==0.27.0 + # via + # microsoft-kiota-http + # msgraph-core + # openai +hypercorn==0.17.3 + # via quart +hyperframe==6.1.0 + # via h2 +idna==3.10 + # via + # anyio + # httpx + # requests + # yarl +importlib-metadata==8.0.0 + # via + # flask + # opentelemetry-api + # quart +isodate==0.6.1 + # via + # azure-ai-documentintelligence + # azure-search-documents + # azure-storage-blob + # azure-storage-file-datalake + # msrest +itsdangerous==2.2.0 + # via + # flask + # quart +jinja2==3.1.6 + # via + # flask + # prompty + # quart +jiter==0.11.0 + # via openai +markdown-it-py==3.0.0 + # via rich +markupsafe==3.0.3 + # via + # jinja2 + # quart + # werkzeug +mdurl==0.1.2 + # via markdown-it-py +microsoft-kiota-abstractions==1.9.3 + # via + # microsoft-kiota-authentication-azure + # microsoft-kiota-http + # microsoft-kiota-serialization-form + # microsoft-kiota-serialization-json + # microsoft-kiota-serialization-multipart + # microsoft-kiota-serialization-text + # msgraph-core +microsoft-kiota-authentication-azure==1.9.3 + # via msgraph-core +microsoft-kiota-http==1.9.3 + # via msgraph-core +microsoft-kiota-serialization-form==1.9.3 + # via msgraph-sdk +microsoft-kiota-serialization-json==1.9.3 + # via msgraph-sdk +microsoft-kiota-serialization-multipart==1.9.3 + # via msgraph-sdk +microsoft-kiota-serialization-text==1.9.3 + # via msgraph-sdk +msal==1.33.0 + # via + # -r requirements.in + # azure-identity + # msal-extensions +msal-extensions==1.3.1 + # via azure-identity +msgraph-core==1.3.3 + # via msgraph-sdk +msgraph-sdk==1.45.0 + # via -r requirements.in +msrest==0.7.1 + # via azure-monitor-opentelemetry-exporter +multidict==6.7.0 + # via + # aiohttp + # yarl +oauthlib==3.3.1 + # via requests-oauthlib +openai==1.99.8 + # via -r requirements.in +opentelemetry-api==1.38.0 + # via + # azure-core-tracing-opentelemetry + # azure-monitor-opentelemetry-exporter + # 
microsoft-kiota-abstractions + # microsoft-kiota-authentication-azure + # microsoft-kiota-http + # opentelemetry-instrumentation + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-flask + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-openai + # opentelemetry-instrumentation-psycopg2 + # opentelemetry-instrumentation-requests + # opentelemetry-instrumentation-urllib + # opentelemetry-instrumentation-urllib3 + # opentelemetry-instrumentation-wsgi + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-instrumentation==0.59b0 + # via + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-flask + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-openai + # opentelemetry-instrumentation-psycopg2 + # opentelemetry-instrumentation-requests + # opentelemetry-instrumentation-urllib + # opentelemetry-instrumentation-urllib3 + # opentelemetry-instrumentation-wsgi +opentelemetry-instrumentation-aiohttp-client==0.59b0 + # via -r requirements.in +opentelemetry-instrumentation-asgi==0.59b0 + # via + # -r requirements.in + # opentelemetry-instrumentation-fastapi +opentelemetry-instrumentation-dbapi==0.59b0 + # via opentelemetry-instrumentation-psycopg2 +opentelemetry-instrumentation-django==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-fastapi==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-flask==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-httpx==0.59b0 + # via -r requirements.in +opentelemetry-instrumentation-openai==0.47.5 + # via -r requirements.in 
+opentelemetry-instrumentation-psycopg2==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-requests==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-urllib==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-urllib3==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-wsgi==0.59b0 + # via + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-flask +opentelemetry-resource-detector-azure==0.1.5 + # via azure-monitor-opentelemetry +opentelemetry-sdk==1.38.0 + # via + # azure-monitor-opentelemetry + # azure-monitor-opentelemetry-exporter + # microsoft-kiota-abstractions + # microsoft-kiota-authentication-azure + # microsoft-kiota-http + # opentelemetry-resource-detector-azure +opentelemetry-semantic-conventions==0.59b0 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-flask + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-openai + # opentelemetry-instrumentation-requests + # opentelemetry-instrumentation-urllib + # opentelemetry-instrumentation-urllib3 + # opentelemetry-instrumentation-wsgi + # opentelemetry-sdk +opentelemetry-semantic-conventions-ai==0.4.13 + # via opentelemetry-instrumentation-openai +opentelemetry-util-http==0.59b0 + # via + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-flask + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-requests + # opentelemetry-instrumentation-urllib + # opentelemetry-instrumentation-urllib3 + # opentelemetry-instrumentation-wsgi +packaging==24.1 + # via + # 
opentelemetry-instrumentation + # opentelemetry-instrumentation-flask +pillow==10.4.0 + # via -r requirements.in +priority==2.0.0 + # via hypercorn +prompty==0.1.50 + # via -r requirements.in +propcache==0.2.0 + # via + # aiohttp + # yarl +psutil==7.1.2 + # via azure-monitor-opentelemetry-exporter +pycparser==2.22 + # via cffi +pydantic==2.12.3 + # via openai +pydantic-core==2.41.4 + # via pydantic +pygments==2.19.2 + # via rich +pyjwt==2.10.1 + # via + # -r requirements.in + # msal +pymupdf==1.26.0 + # via -r requirements.in +pypdf==6.1.3 + # via -r requirements.in +python-dotenv==1.1.1 + # via + # -r requirements.in + # prompty +pyyaml==6.0.2 + # via prompty +quart==0.20.0 + # via + # -r requirements.in + # quart-cors +quart-cors==0.7.0 + # via -r requirements.in +regex==2025.7.34 + # via tiktoken +requests==2.32.4 + # via + # azure-core + # msal + # msrest + # requests-oauthlib + # tiktoken +requests-oauthlib==2.0.0 + # via msrest +rich==14.1.0 + # via -r requirements.in +six==1.16.0 + # via + # azure-core + # isodate +sniffio==1.3.1 + # via + # anyio + # httpx + # openai +soupsieve==2.7 + # via beautifulsoup4 +std-uritemplate==2.0.5 + # via microsoft-kiota-abstractions +taskgroup==0.2.2 + # via hypercorn +tenacity==9.1.2 + # via -r requirements.in +tiktoken==0.8.0 + # via -r requirements.in +tomli==2.2.1 + # via hypercorn +tqdm==4.66.5 + # via openai +types-beautifulsoup4==4.12.0.20240511 + # via -r requirements.in +types-html5lib==1.1.11.20241018 + # via types-beautifulsoup4 +types-pillow==10.2.0.20240822 + # via -r requirements.in +typing-extensions==4.15.0 + # via + # -r requirements.in + # aiosignal + # anyio + # asgiref + # azure-ai-documentintelligence + # azure-core + # azure-cosmos + # azure-identity + # azure-search-documents + # azure-storage-blob + # azure-storage-file-datalake + # exceptiongroup + # hypercorn + # multidict + # openai + # opentelemetry-api + # opentelemetry-sdk + # opentelemetry-semantic-conventions + # pydantic + # pydantic-core + # 
pypdf + # quart + # quart-cors + # taskgroup + # typing-inspection + # uvicorn +typing-inspection==0.4.2 + # via pydantic +urllib3==2.5.0 + # via requests +uvicorn==0.30.6 + # via -r requirements.in +werkzeug==3.1.3 + # via + # azure-functions + # flask + # quart +wrapt==1.16.0 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-urllib3 +wsproto==1.2.0 + # via hypercorn +yarl==1.17.2 + # via aiohttp +zipp==3.21.0 + # via importlib-metadata diff --git a/app/functions/figure_processor/function_app.py b/app/functions/figure_processor/function_app.py index a883220110..0c1fcfcb38 100644 --- a/app/functions/figure_processor/function_app.py +++ b/app/functions/figure_processor/function_app.py @@ -67,7 +67,7 @@ BLOB_MANAGER = setup_blob_manager( storage_account=AZURE_STORAGE_ACCOUNT, storage_container=IMAGE_CONTAINER, - credential=GLOBAL_CREDENTIAL, + azure_credential=GLOBAL_CREDENTIAL, image_storage_container=IMAGE_CONTAINER, ) else: @@ -118,7 +118,7 @@ @app.function_name(name="process_figure") -@app.route(route="process", methods=["POST"]) +@app.route(route="process", methods=["POST"], auth_level=func.AuthLevel.ANONYMOUS) async def process_figure_request(req: func.HttpRequest) -> func.HttpResponse: """Entrypoint for Azure Search custom skill calls.""" diff --git a/app/functions/figure_processor/requirements.txt b/app/functions/figure_processor/requirements.txt index 49ab46bcaa..8cd04cf565 100644 --- a/app/functions/figure_processor/requirements.txt +++ b/app/functions/figure_processor/requirements.txt @@ -1,14 +1,461 @@ -azure-functions>=1.21.3,<2.0.0 -azure-identity -azure-core -azure-storage-blob -azure-storage-file-datalake -azure-search-documents==11.7.0b1 +# This file was autogenerated by uv via the following command: +# uv pip compile requirements.in -o requirements.txt --python-version 3.9 +aiofiles==24.1.0 + # 
via + # prompty + # quart +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.12.14 + # via + # -r requirements.in + # microsoft-kiota-authentication-azure +aiosignal==1.4.0 + # via aiohttp +annotated-types==0.7.0 + # via pydantic +anyio==4.4.0 + # via + # httpx + # openai +asgiref==3.10.0 + # via opentelemetry-instrumentation-asgi +async-timeout==5.0.1 + # via aiohttp +attrs==25.3.0 + # via aiohttp azure-ai-documentintelligence==1.0.0b4 -typing-extensions -beautifulsoup4 -pillow -PyMuPDF -pypdf -tenacity -rich + # via -r requirements.in +azure-cognitiveservices-speech==1.40.0 + # via -r requirements.in +azure-common==1.1.28 + # via azure-search-documents +azure-core==1.35.0 + # via + # azure-ai-documentintelligence + # azure-core-tracing-opentelemetry + # azure-cosmos + # azure-identity + # azure-monitor-opentelemetry + # azure-monitor-opentelemetry-exporter + # azure-search-documents + # azure-storage-blob + # azure-storage-file-datalake + # microsoft-kiota-authentication-azure + # msrest +azure-core-tracing-opentelemetry==1.0.0b11 + # via azure-monitor-opentelemetry +azure-cosmos==4.9.0 + # via -r requirements.in +azure-functions==1.24.0 + # via -r requirements.in +azure-identity==1.17.1 + # via + # -r requirements.in + # azure-monitor-opentelemetry-exporter + # msgraph-sdk +azure-monitor-opentelemetry==1.8.1 + # via -r requirements.in +azure-monitor-opentelemetry-exporter==1.0.0b44 + # via azure-monitor-opentelemetry +azure-search-documents==11.7.0b1 + # via -r requirements.in +azure-storage-blob==12.22.0 + # via + # -r requirements.in + # azure-storage-file-datalake +azure-storage-file-datalake==12.16.0 + # via -r requirements.in +beautifulsoup4==4.12.3 + # via -r requirements.in +blinker==1.8.2 + # via + # flask + # quart +certifi==2024.7.4 + # via + # httpcore + # httpx + # msrest + # requests +cffi==1.17.0 + # via cryptography +charset-normalizer==3.3.2 + # via requests +click==8.1.8 + # via + # flask + # prompty + # quart + # uvicorn +cryptography==44.0.1 
+ # via + # -r requirements.in + # azure-identity + # azure-storage-blob + # msal + # pyjwt +distro==1.9.0 + # via openai +exceptiongroup==1.3.0 + # via + # anyio + # hypercorn + # taskgroup +fixedint==0.1.6 + # via azure-monitor-opentelemetry-exporter +flask==3.0.3 + # via quart +frozenlist==1.4.1 + # via + # aiohttp + # aiosignal +h11==0.16.0 + # via + # httpcore + # hypercorn + # uvicorn + # wsproto +h2==4.3.0 + # via + # httpx + # hypercorn +hpack==4.1.0 + # via h2 +httpcore==1.0.9 + # via httpx +httpx==0.27.0 + # via + # microsoft-kiota-http + # msgraph-core + # openai +hypercorn==0.17.3 + # via quart +hyperframe==6.1.0 + # via h2 +idna==3.10 + # via + # anyio + # httpx + # requests + # yarl +importlib-metadata==8.0.0 + # via + # flask + # opentelemetry-api + # quart +isodate==0.6.1 + # via + # azure-ai-documentintelligence + # azure-search-documents + # azure-storage-blob + # azure-storage-file-datalake + # msrest +itsdangerous==2.2.0 + # via + # flask + # quart +jinja2==3.1.6 + # via + # flask + # prompty + # quart +jiter==0.11.0 + # via openai +markdown-it-py==3.0.0 + # via rich +markupsafe==3.0.3 + # via + # jinja2 + # quart + # werkzeug +mdurl==0.1.2 + # via markdown-it-py +microsoft-kiota-abstractions==1.9.3 + # via + # microsoft-kiota-authentication-azure + # microsoft-kiota-http + # microsoft-kiota-serialization-form + # microsoft-kiota-serialization-json + # microsoft-kiota-serialization-multipart + # microsoft-kiota-serialization-text + # msgraph-core +microsoft-kiota-authentication-azure==1.9.3 + # via msgraph-core +microsoft-kiota-http==1.9.3 + # via msgraph-core +microsoft-kiota-serialization-form==1.9.3 + # via msgraph-sdk +microsoft-kiota-serialization-json==1.9.3 + # via msgraph-sdk +microsoft-kiota-serialization-multipart==1.9.3 + # via msgraph-sdk +microsoft-kiota-serialization-text==1.9.3 + # via msgraph-sdk +msal==1.33.0 + # via + # -r requirements.in + # azure-identity + # msal-extensions +msal-extensions==1.3.1 + # via azure-identity 
+msgraph-core==1.3.3 + # via msgraph-sdk +msgraph-sdk==1.45.0 + # via -r requirements.in +msrest==0.7.1 + # via azure-monitor-opentelemetry-exporter +multidict==6.7.0 + # via + # aiohttp + # yarl +oauthlib==3.3.1 + # via requests-oauthlib +openai==1.99.8 + # via -r requirements.in +opentelemetry-api==1.38.0 + # via + # azure-core-tracing-opentelemetry + # azure-monitor-opentelemetry-exporter + # microsoft-kiota-abstractions + # microsoft-kiota-authentication-azure + # microsoft-kiota-http + # opentelemetry-instrumentation + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-flask + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-openai + # opentelemetry-instrumentation-psycopg2 + # opentelemetry-instrumentation-requests + # opentelemetry-instrumentation-urllib + # opentelemetry-instrumentation-urllib3 + # opentelemetry-instrumentation-wsgi + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-instrumentation==0.59b0 + # via + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-flask + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-openai + # opentelemetry-instrumentation-psycopg2 + # opentelemetry-instrumentation-requests + # opentelemetry-instrumentation-urllib + # opentelemetry-instrumentation-urllib3 + # opentelemetry-instrumentation-wsgi +opentelemetry-instrumentation-aiohttp-client==0.59b0 + # via -r requirements.in +opentelemetry-instrumentation-asgi==0.59b0 + # via + # -r requirements.in + # opentelemetry-instrumentation-fastapi +opentelemetry-instrumentation-dbapi==0.59b0 + # via 
opentelemetry-instrumentation-psycopg2 +opentelemetry-instrumentation-django==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-fastapi==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-flask==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-httpx==0.59b0 + # via -r requirements.in +opentelemetry-instrumentation-openai==0.47.5 + # via -r requirements.in +opentelemetry-instrumentation-psycopg2==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-requests==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-urllib==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-urllib3==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-wsgi==0.59b0 + # via + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-flask +opentelemetry-resource-detector-azure==0.1.5 + # via azure-monitor-opentelemetry +opentelemetry-sdk==1.38.0 + # via + # azure-monitor-opentelemetry + # azure-monitor-opentelemetry-exporter + # microsoft-kiota-abstractions + # microsoft-kiota-authentication-azure + # microsoft-kiota-http + # opentelemetry-resource-detector-azure +opentelemetry-semantic-conventions==0.59b0 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-flask + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-openai + # opentelemetry-instrumentation-requests + # opentelemetry-instrumentation-urllib + # opentelemetry-instrumentation-urllib3 + # opentelemetry-instrumentation-wsgi + # opentelemetry-sdk +opentelemetry-semantic-conventions-ai==0.4.13 + # via opentelemetry-instrumentation-openai +opentelemetry-util-http==0.59b0 + # via + # 
opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-flask + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-requests + # opentelemetry-instrumentation-urllib + # opentelemetry-instrumentation-urllib3 + # opentelemetry-instrumentation-wsgi +packaging==24.1 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-flask +pillow==10.4.0 + # via -r requirements.in +priority==2.0.0 + # via hypercorn +prompty==0.1.50 + # via -r requirements.in +propcache==0.2.0 + # via + # aiohttp + # yarl +psutil==7.1.2 + # via azure-monitor-opentelemetry-exporter +pycparser==2.22 + # via cffi +pydantic==2.12.3 + # via openai +pydantic-core==2.41.4 + # via pydantic +pygments==2.19.2 + # via rich +pyjwt==2.10.1 + # via + # -r requirements.in + # msal +pymupdf==1.26.0 + # via -r requirements.in +pypdf==6.1.3 + # via -r requirements.in +python-dotenv==1.1.1 + # via + # -r requirements.in + # prompty +pyyaml==6.0.2 + # via prompty +quart==0.20.0 + # via + # -r requirements.in + # quart-cors +quart-cors==0.7.0 + # via -r requirements.in +regex==2025.7.34 + # via tiktoken +requests==2.32.4 + # via + # azure-core + # msal + # msrest + # requests-oauthlib + # tiktoken +requests-oauthlib==2.0.0 + # via msrest +rich==14.1.0 + # via -r requirements.in +six==1.16.0 + # via + # azure-core + # isodate +sniffio==1.3.1 + # via + # anyio + # httpx + # openai +soupsieve==2.7 + # via beautifulsoup4 +std-uritemplate==2.0.5 + # via microsoft-kiota-abstractions +taskgroup==0.2.2 + # via hypercorn +tenacity==9.1.2 + # via -r requirements.in +tiktoken==0.8.0 + # via -r requirements.in +tomli==2.2.1 + # via hypercorn +tqdm==4.66.5 + # via openai +types-beautifulsoup4==4.12.0.20240511 + # via -r requirements.in +types-html5lib==1.1.11.20241018 + # via types-beautifulsoup4 +types-pillow==10.2.0.20240822 + # via -r 
requirements.in +typing-extensions==4.15.0 + # via + # -r requirements.in + # aiosignal + # anyio + # asgiref + # azure-ai-documentintelligence + # azure-core + # azure-cosmos + # azure-identity + # azure-search-documents + # azure-storage-blob + # azure-storage-file-datalake + # exceptiongroup + # hypercorn + # multidict + # openai + # opentelemetry-api + # opentelemetry-sdk + # opentelemetry-semantic-conventions + # pydantic + # pydantic-core + # pypdf + # quart + # quart-cors + # taskgroup + # typing-inspection + # uvicorn +typing-inspection==0.4.2 + # via pydantic +urllib3==2.5.0 + # via requests +uvicorn==0.30.6 + # via -r requirements.in +werkzeug==3.1.3 + # via + # azure-functions + # flask + # quart +wrapt==1.16.0 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-urllib3 +wsproto==1.2.0 + # via hypercorn +yarl==1.17.2 + # via aiohttp +zipp==3.21.0 + # via importlib-metadata diff --git a/app/functions/text_processor/requirements.txt b/app/functions/text_processor/requirements.txt index 49ab46bcaa..8cd04cf565 100644 --- a/app/functions/text_processor/requirements.txt +++ b/app/functions/text_processor/requirements.txt @@ -1,14 +1,461 @@ -azure-functions>=1.21.3,<2.0.0 -azure-identity -azure-core -azure-storage-blob -azure-storage-file-datalake -azure-search-documents==11.7.0b1 +# This file was autogenerated by uv via the following command: +# uv pip compile requirements.in -o requirements.txt --python-version 3.9 +aiofiles==24.1.0 + # via + # prompty + # quart +aiohappyeyeballs==2.6.1 + # via aiohttp +aiohttp==3.12.14 + # via + # -r requirements.in + # microsoft-kiota-authentication-azure +aiosignal==1.4.0 + # via aiohttp +annotated-types==0.7.0 + # via pydantic +anyio==4.4.0 + # via + # httpx + # openai +asgiref==3.10.0 + # via opentelemetry-instrumentation-asgi +async-timeout==5.0.1 + # via aiohttp 
+attrs==25.3.0 + # via aiohttp azure-ai-documentintelligence==1.0.0b4 -typing-extensions -beautifulsoup4 -pillow -PyMuPDF -pypdf -tenacity -rich + # via -r requirements.in +azure-cognitiveservices-speech==1.40.0 + # via -r requirements.in +azure-common==1.1.28 + # via azure-search-documents +azure-core==1.35.0 + # via + # azure-ai-documentintelligence + # azure-core-tracing-opentelemetry + # azure-cosmos + # azure-identity + # azure-monitor-opentelemetry + # azure-monitor-opentelemetry-exporter + # azure-search-documents + # azure-storage-blob + # azure-storage-file-datalake + # microsoft-kiota-authentication-azure + # msrest +azure-core-tracing-opentelemetry==1.0.0b11 + # via azure-monitor-opentelemetry +azure-cosmos==4.9.0 + # via -r requirements.in +azure-functions==1.24.0 + # via -r requirements.in +azure-identity==1.17.1 + # via + # -r requirements.in + # azure-monitor-opentelemetry-exporter + # msgraph-sdk +azure-monitor-opentelemetry==1.8.1 + # via -r requirements.in +azure-monitor-opentelemetry-exporter==1.0.0b44 + # via azure-monitor-opentelemetry +azure-search-documents==11.7.0b1 + # via -r requirements.in +azure-storage-blob==12.22.0 + # via + # -r requirements.in + # azure-storage-file-datalake +azure-storage-file-datalake==12.16.0 + # via -r requirements.in +beautifulsoup4==4.12.3 + # via -r requirements.in +blinker==1.8.2 + # via + # flask + # quart +certifi==2024.7.4 + # via + # httpcore + # httpx + # msrest + # requests +cffi==1.17.0 + # via cryptography +charset-normalizer==3.3.2 + # via requests +click==8.1.8 + # via + # flask + # prompty + # quart + # uvicorn +cryptography==44.0.1 + # via + # -r requirements.in + # azure-identity + # azure-storage-blob + # msal + # pyjwt +distro==1.9.0 + # via openai +exceptiongroup==1.3.0 + # via + # anyio + # hypercorn + # taskgroup +fixedint==0.1.6 + # via azure-monitor-opentelemetry-exporter +flask==3.0.3 + # via quart +frozenlist==1.4.1 + # via + # aiohttp + # aiosignal +h11==0.16.0 + # via + # httpcore + # 
hypercorn + # uvicorn + # wsproto +h2==4.3.0 + # via + # httpx + # hypercorn +hpack==4.1.0 + # via h2 +httpcore==1.0.9 + # via httpx +httpx==0.27.0 + # via + # microsoft-kiota-http + # msgraph-core + # openai +hypercorn==0.17.3 + # via quart +hyperframe==6.1.0 + # via h2 +idna==3.10 + # via + # anyio + # httpx + # requests + # yarl +importlib-metadata==8.0.0 + # via + # flask + # opentelemetry-api + # quart +isodate==0.6.1 + # via + # azure-ai-documentintelligence + # azure-search-documents + # azure-storage-blob + # azure-storage-file-datalake + # msrest +itsdangerous==2.2.0 + # via + # flask + # quart +jinja2==3.1.6 + # via + # flask + # prompty + # quart +jiter==0.11.0 + # via openai +markdown-it-py==3.0.0 + # via rich +markupsafe==3.0.3 + # via + # jinja2 + # quart + # werkzeug +mdurl==0.1.2 + # via markdown-it-py +microsoft-kiota-abstractions==1.9.3 + # via + # microsoft-kiota-authentication-azure + # microsoft-kiota-http + # microsoft-kiota-serialization-form + # microsoft-kiota-serialization-json + # microsoft-kiota-serialization-multipart + # microsoft-kiota-serialization-text + # msgraph-core +microsoft-kiota-authentication-azure==1.9.3 + # via msgraph-core +microsoft-kiota-http==1.9.3 + # via msgraph-core +microsoft-kiota-serialization-form==1.9.3 + # via msgraph-sdk +microsoft-kiota-serialization-json==1.9.3 + # via msgraph-sdk +microsoft-kiota-serialization-multipart==1.9.3 + # via msgraph-sdk +microsoft-kiota-serialization-text==1.9.3 + # via msgraph-sdk +msal==1.33.0 + # via + # -r requirements.in + # azure-identity + # msal-extensions +msal-extensions==1.3.1 + # via azure-identity +msgraph-core==1.3.3 + # via msgraph-sdk +msgraph-sdk==1.45.0 + # via -r requirements.in +msrest==0.7.1 + # via azure-monitor-opentelemetry-exporter +multidict==6.7.0 + # via + # aiohttp + # yarl +oauthlib==3.3.1 + # via requests-oauthlib +openai==1.99.8 + # via -r requirements.in +opentelemetry-api==1.38.0 + # via + # azure-core-tracing-opentelemetry + # 
azure-monitor-opentelemetry-exporter + # microsoft-kiota-abstractions + # microsoft-kiota-authentication-azure + # microsoft-kiota-http + # opentelemetry-instrumentation + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-flask + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-openai + # opentelemetry-instrumentation-psycopg2 + # opentelemetry-instrumentation-requests + # opentelemetry-instrumentation-urllib + # opentelemetry-instrumentation-urllib3 + # opentelemetry-instrumentation-wsgi + # opentelemetry-sdk + # opentelemetry-semantic-conventions +opentelemetry-instrumentation==0.59b0 + # via + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-flask + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-openai + # opentelemetry-instrumentation-psycopg2 + # opentelemetry-instrumentation-requests + # opentelemetry-instrumentation-urllib + # opentelemetry-instrumentation-urllib3 + # opentelemetry-instrumentation-wsgi +opentelemetry-instrumentation-aiohttp-client==0.59b0 + # via -r requirements.in +opentelemetry-instrumentation-asgi==0.59b0 + # via + # -r requirements.in + # opentelemetry-instrumentation-fastapi +opentelemetry-instrumentation-dbapi==0.59b0 + # via opentelemetry-instrumentation-psycopg2 +opentelemetry-instrumentation-django==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-fastapi==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-flask==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-httpx==0.59b0 + # via -r requirements.in 
+opentelemetry-instrumentation-openai==0.47.5 + # via -r requirements.in +opentelemetry-instrumentation-psycopg2==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-requests==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-urllib==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-urllib3==0.59b0 + # via azure-monitor-opentelemetry +opentelemetry-instrumentation-wsgi==0.59b0 + # via + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-flask +opentelemetry-resource-detector-azure==0.1.5 + # via azure-monitor-opentelemetry +opentelemetry-sdk==1.38.0 + # via + # azure-monitor-opentelemetry + # azure-monitor-opentelemetry-exporter + # microsoft-kiota-abstractions + # microsoft-kiota-authentication-azure + # microsoft-kiota-http + # opentelemetry-resource-detector-azure +opentelemetry-semantic-conventions==0.59b0 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-flask + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-openai + # opentelemetry-instrumentation-requests + # opentelemetry-instrumentation-urllib + # opentelemetry-instrumentation-urllib3 + # opentelemetry-instrumentation-wsgi + # opentelemetry-sdk +opentelemetry-semantic-conventions-ai==0.4.13 + # via opentelemetry-instrumentation-openai +opentelemetry-util-http==0.59b0 + # via + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-asgi + # opentelemetry-instrumentation-django + # opentelemetry-instrumentation-fastapi + # opentelemetry-instrumentation-flask + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-requests + # opentelemetry-instrumentation-urllib + # opentelemetry-instrumentation-urllib3 + # 
opentelemetry-instrumentation-wsgi +packaging==24.1 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-flask +pillow==10.4.0 + # via -r requirements.in +priority==2.0.0 + # via hypercorn +prompty==0.1.50 + # via -r requirements.in +propcache==0.2.0 + # via + # aiohttp + # yarl +psutil==7.1.2 + # via azure-monitor-opentelemetry-exporter +pycparser==2.22 + # via cffi +pydantic==2.12.3 + # via openai +pydantic-core==2.41.4 + # via pydantic +pygments==2.19.2 + # via rich +pyjwt==2.10.1 + # via + # -r requirements.in + # msal +pymupdf==1.26.0 + # via -r requirements.in +pypdf==6.1.3 + # via -r requirements.in +python-dotenv==1.1.1 + # via + # -r requirements.in + # prompty +pyyaml==6.0.2 + # via prompty +quart==0.20.0 + # via + # -r requirements.in + # quart-cors +quart-cors==0.7.0 + # via -r requirements.in +regex==2025.7.34 + # via tiktoken +requests==2.32.4 + # via + # azure-core + # msal + # msrest + # requests-oauthlib + # tiktoken +requests-oauthlib==2.0.0 + # via msrest +rich==14.1.0 + # via -r requirements.in +six==1.16.0 + # via + # azure-core + # isodate +sniffio==1.3.1 + # via + # anyio + # httpx + # openai +soupsieve==2.7 + # via beautifulsoup4 +std-uritemplate==2.0.5 + # via microsoft-kiota-abstractions +taskgroup==0.2.2 + # via hypercorn +tenacity==9.1.2 + # via -r requirements.in +tiktoken==0.8.0 + # via -r requirements.in +tomli==2.2.1 + # via hypercorn +tqdm==4.66.5 + # via openai +types-beautifulsoup4==4.12.0.20240511 + # via -r requirements.in +types-html5lib==1.1.11.20241018 + # via types-beautifulsoup4 +types-pillow==10.2.0.20240822 + # via -r requirements.in +typing-extensions==4.15.0 + # via + # -r requirements.in + # aiosignal + # anyio + # asgiref + # azure-ai-documentintelligence + # azure-core + # azure-cosmos + # azure-identity + # azure-search-documents + # azure-storage-blob + # azure-storage-file-datalake + # exceptiongroup + # hypercorn + # multidict + # openai + # opentelemetry-api + # opentelemetry-sdk + # 
opentelemetry-semantic-conventions + # pydantic + # pydantic-core + # pypdf + # quart + # quart-cors + # taskgroup + # typing-inspection + # uvicorn +typing-inspection==0.4.2 + # via pydantic +urllib3==2.5.0 + # via requests +uvicorn==0.30.6 + # via -r requirements.in +werkzeug==3.1.3 + # via + # azure-functions + # flask + # quart +wrapt==1.16.0 + # via + # opentelemetry-instrumentation + # opentelemetry-instrumentation-aiohttp-client + # opentelemetry-instrumentation-dbapi + # opentelemetry-instrumentation-httpx + # opentelemetry-instrumentation-urllib3 +wsproto==1.2.0 + # via hypercorn +yarl==1.17.2 + # via aiohttp +zipp==3.21.0 + # via importlib-metadata diff --git a/azure.yaml b/azure.yaml index 46f0167363..24389c5af6 100644 --- a/azure.yaml +++ b/azure.yaml @@ -48,12 +48,12 @@ services: prepackage: windows: shell: pwsh - run: python ../../../scripts/copy_prepdocslib.py;python -m pip install --upgrade pip;python -m pip install -r requirements.txt -t .python_packages/lib/site-packages + run: python ../../../scripts/copy_prepdocslib.py interactive: false continueOnError: false posix: shell: sh - run: python ../../../scripts/copy_prepdocslib.py && python -m pip install --upgrade pip && python -m pip install -r requirements.txt -t .python_packages/lib/site-packages + run: python ../../../scripts/copy_prepdocslib.py interactive: false continueOnError: false figure-processor: @@ -64,12 +64,12 @@ services: prepackage: windows: shell: pwsh - run: python ../../../scripts/copy_prepdocslib.py;python -m pip install --upgrade pip;python -m pip install -r requirements.txt -t .python_packages/lib/site-packages + run: python ../../../scripts/copy_prepdocslib.py interactive: false continueOnError: false posix: shell: sh - run: python ../../../scripts/copy_prepdocslib.py && python -m pip install --upgrade pip && python -m pip install -r requirements.txt -t .python_packages/lib/site-packages + run: python ../../../scripts/copy_prepdocslib.py interactive: false continueOnError: 
false text-processor: @@ -80,12 +80,12 @@ services: prepackage: windows: shell: pwsh - run: python ../../../scripts/copy_prepdocslib.py;python -m pip install --upgrade pip;python -m pip install -r requirements.txt -t .python_packages/lib/site-packages + run: python ../../../scripts/copy_prepdocslib.py interactive: false continueOnError: false posix: shell: sh - run: python ../../../scripts/copy_prepdocslib.py && python -m pip install --upgrade pip && python -m pip install -r requirements.txt -t .python_packages/lib/site-packages + run: python ../../../scripts/copy_prepdocslib.py interactive: false continueOnError: false hooks: diff --git a/infra/app/functions.bicep b/infra/app/functions.bicep index 75cf78b9f0..ba1e8dd390 100644 --- a/infra/app/functions.bicep +++ b/infra/app/functions.bicep @@ -41,6 +41,29 @@ param openAiCustomUrl string var abbrs = loadJsonContent('../abbreviations.json') var resourceToken = toLower(uniqueString(subscription().id, resourceGroup().id, location)) +var documentExtractorRuntimeStorageName = '${abbrs.storageStorageAccounts}doc${take(resourceToken, 18)}' +var figureProcessorRuntimeStorageName = '${abbrs.storageStorageAccounts}fig${take(resourceToken, 18)}' +var textProcessorRuntimeStorageName = '${abbrs.storageStorageAccounts}txt${take(resourceToken, 18)}' + +var documentExtractorHostId = 'doc-skill-${take(resourceToken, 12)}' +var figureProcessorHostId = 'fig-skill-${take(resourceToken, 12)}' +var textProcessorHostId = 'txt-skill-${take(resourceToken, 12)}' + +var runtimeStorageRoles = [ + { + suffix: 'blob' + roleDefinitionId: 'ba92f5b4-2d11-453d-a403-e96b0029c9fe' + } + { + suffix: 'queue' + roleDefinitionId: '974c5e8b-45b9-4653-ba55-5f855dd0fb88' + } + { + suffix: 'table' + roleDefinitionId: '0a9a7e1f-b9d0-4cc4-a60d-0319b160aaa3' + } +] + // Common app settings for both functions var commonAppSettings = { // Storage @@ -92,21 +115,103 @@ var documentExtractorDeploymentContainer = 'deploy-doc-extractor-${take(resource var 
figureProcessorDeploymentContainer = 'deploy-figure-processor-${take(resourceToken, 7)}' var textProcessorDeploymentContainer = 'deploy-text-processor-${take(resourceToken, 7)}' -// Create deployment containers in storage account -// Create deployment containers via cross-scope module (avoids re-deploying storage account configuration) -module deploymentContainers 'storage-containers.bicep' = { - name: 'function-deployment-containers' - scope: resourceGroup(storageResourceGroupName) +// Runtime storage accounts per function (Flex Consumption requirement) +module documentExtractorRuntimeStorageAccount '../core/storage/storage-account.bicep' = { + name: 'doc-extractor-runtime-storage' + params: { + name: documentExtractorRuntimeStorageName + location: location + tags: tags + allowBlobPublicAccess: false + containers: [ + { + name: documentExtractorDeploymentContainer + } + ] + } +} + +module figureProcessorRuntimeStorageAccount '../core/storage/storage-account.bicep' = { + name: 'figure-processor-runtime-storage' params: { - storageAccountName: storageAccountName - containerNames: [ - documentExtractorDeploymentContainer - figureProcessorDeploymentContainer - textProcessorDeploymentContainer + name: figureProcessorRuntimeStorageName + location: location + tags: tags + allowBlobPublicAccess: false + containers: [ + { + name: figureProcessorDeploymentContainer + } + ] + } +} + +module textProcessorRuntimeStorageAccount '../core/storage/storage-account.bicep' = { + name: 'text-processor-runtime-storage' + params: { + name: textProcessorRuntimeStorageName + location: location + tags: tags + allowBlobPublicAccess: false + containers: [ + { + name: textProcessorDeploymentContainer + } ] } } +resource documentExtractorRuntimeStorage 'Microsoft.Storage/storageAccounts@2024-01-01' existing = { + name: documentExtractorRuntimeStorageName +} + +resource figureProcessorRuntimeStorage 'Microsoft.Storage/storageAccounts@2024-01-01' existing = { + name: 
figureProcessorRuntimeStorageName +} + +resource textProcessorRuntimeStorage 'Microsoft.Storage/storageAccounts@2024-01-01' existing = { + name: textProcessorRuntimeStorageName +} + +resource documentExtractorRuntimeStorageRoles 'Microsoft.Authorization/roleAssignments@2022-04-01' = [for role in runtimeStorageRoles: { + name: guid(documentExtractorRuntimeStorage.id, role.roleDefinitionId, 'doc-runtime') + scope: documentExtractorRuntimeStorage + properties: { + principalId: documentExtractorIdentity.outputs.principalId + principalType: 'ServicePrincipal' + roleDefinitionId: resourceId('Microsoft.Authorization/roleDefinitions', role.roleDefinitionId) + } + dependsOn: [ + documentExtractorRuntimeStorageAccount + ] +}] + +resource figureProcessorRuntimeStorageRoles 'Microsoft.Authorization/roleAssignments@2022-04-01' = [for role in runtimeStorageRoles: { + name: guid(figureProcessorRuntimeStorage.id, role.roleDefinitionId, 'figure-runtime') + scope: figureProcessorRuntimeStorage + properties: { + principalId: figureProcessorIdentity.outputs.principalId + principalType: 'ServicePrincipal' + roleDefinitionId: resourceId('Microsoft.Authorization/roleDefinitions', role.roleDefinitionId) + } + dependsOn: [ + figureProcessorRuntimeStorageAccount + ] +}] + +resource textProcessorRuntimeStorageRoles 'Microsoft.Authorization/roleAssignments@2022-04-01' = [for role in runtimeStorageRoles: { + name: guid(textProcessorRuntimeStorage.id, role.roleDefinitionId, 'text-runtime') + scope: textProcessorRuntimeStorage + properties: { + principalId: textProcessorIdentity.outputs.principalId + principalType: 'ServicePrincipal' + roleDefinitionId: resourceId('Microsoft.Authorization/roleDefinitions', role.roleDefinitionId) + } + dependsOn: [ + textProcessorRuntimeStorageAccount + ] +}] + // User-assigned managed identity for document extractor module documentExtractorIdentity 'br/public:avm/res/managed-identity/user-assigned-identity:0.4.1' = { name: 'doc-extractor-identity' @@ -204,11 
+309,13 @@ module documentExtractor 'functions-app.bicep' = { appServicePlanId: documentExtractorPlan.outputs.resourceId runtimeName: 'python' runtimeVersion: '3.11' - storageAccountName: storageAccountName + storageAccountName: documentExtractorRuntimeStorageName deploymentStorageContainerName: documentExtractorDeploymentContainer identityId: documentExtractorIdentity.outputs.resourceId identityClientId: documentExtractorIdentity.outputs.clientId - appSettings: allAppSettings + appSettings: union(allAppSettings, { + AzureFunctionsWebHost__hostid: documentExtractorHostId + }) instanceMemoryMB: 4096 // High memory for document processing maximumInstanceCount: 100 // Removed unused functionTimeout parameter; configured defaults via host settings @@ -216,7 +323,7 @@ module documentExtractor 'functions-app.bicep' = { skillAppAudience: 'api://${documentExtractorAppReg.outputs.clientAppId}' } dependsOn: [ - deploymentContainers + documentExtractorRuntimeStorageAccount ] } @@ -243,18 +350,20 @@ module figureProcessor 'functions-app.bicep' = { appServicePlanId: figureProcessorPlan.outputs.resourceId runtimeName: 'python' runtimeVersion: '3.11' - storageAccountName: storageAccountName + storageAccountName: figureProcessorRuntimeStorageName deploymentStorageContainerName: figureProcessorDeploymentContainer identityId: figureProcessorIdentity.outputs.resourceId identityClientId: figureProcessorIdentity.outputs.clientId - appSettings: allAppSettings + appSettings: union(allAppSettings, { + AzureFunctionsWebHost__hostid: figureProcessorHostId + }) instanceMemoryMB: 2048 maximumInstanceCount: 100 skillAppClientId: figureProcessorAppReg.outputs.clientAppId skillAppAudience: 'api://${figureProcessorAppReg.outputs.clientAppId}' } dependsOn: [ - deploymentContainers + figureProcessorRuntimeStorageAccount ] } @@ -281,18 +390,20 @@ module textProcessor 'functions-app.bicep' = { appServicePlanId: textProcessorPlan.outputs.resourceId runtimeName: 'python' runtimeVersion: '3.11' - 
storageAccountName: storageAccountName + storageAccountName: textProcessorRuntimeStorageName deploymentStorageContainerName: textProcessorDeploymentContainer identityId: textProcessorIdentity.outputs.resourceId identityClientId: textProcessorIdentity.outputs.clientId - appSettings: allAppSettings + appSettings: union(allAppSettings, { + AzureFunctionsWebHost__hostid: textProcessorHostId + }) instanceMemoryMB: 2048 // Standard memory for embedding maximumInstanceCount: 100 skillAppClientId: textProcessorAppReg.outputs.clientAppId skillAppAudience: 'api://${textProcessorAppReg.outputs.clientAppId}' } dependsOn: [ - deploymentContainers + textProcessorRuntimeStorageAccount ] } @@ -352,17 +463,23 @@ output documentExtractorName string = documentExtractor.outputs.name output documentExtractorUrl string = documentExtractor.outputs.defaultHostname output documentExtractorIdentityPrincipalId string = documentExtractorIdentity.outputs.principalId output documentExtractorClientAppId string = documentExtractorAppReg.outputs.clientAppId -output documentExtractorSkillResourceId string = 'api://${documentExtractorAppReg.outputs.clientAppId}' : documentExtractorAppReg.outputs.clientAppId +output documentExtractorSkillResourceId string = documentExtractorAppReg.outputs.clientAppId output figureProcessorName string = figureProcessor.outputs.name output figureProcessorUrl string = figureProcessor.outputs.defaultHostname output figureProcessorIdentityPrincipalId string = figureProcessorIdentity.outputs.principalId output figureProcessorClientAppId string = figureProcessorAppReg.outputs.clientAppId -output figureProcessorSkillResourceId string = 'api://${figureProcessorAppReg.outputs.clientAppId}' +output figureProcessorSkillResourceId string = figureProcessorAppReg.outputs.clientAppId output textProcessorName string = textProcessor.outputs.name output textProcessorUrl string = textProcessor.outputs.defaultHostname output textProcessorIdentityPrincipalId string = 
textProcessorIdentity.outputs.principalId output textProcessorClientAppId string = textProcessorAppReg.outputs.clientAppId -output textProcessorSkillResourceId string = 'api://${textProcessorAppReg.outputs.clientAppId}' +output textProcessorSkillResourceId string = textProcessorAppReg.outputs.clientAppId +output documentExtractorRuntimeStorageName string = documentExtractorRuntimeStorageName +output figureProcessorRuntimeStorageName string = figureProcessorRuntimeStorageName +output textProcessorRuntimeStorageName string = textProcessorRuntimeStorageName +output documentExtractorHostId string = documentExtractorHostId +output figureProcessorHostId string = figureProcessorHostId +output textProcessorHostId string = textProcessorHostId // Output the last plan id (text processor) for potential diagnostics; others can be added if needed output appServicePlanId string = textProcessorPlan.outputs.resourceId // Resource IDs for each function app (used for auth_resource_id with managed identity secured skills) diff --git a/scripts/copy_prepdocslib.py b/scripts/copy_prepdocslib.py index 540e97987c..30ec9fa38c 100644 --- a/scripts/copy_prepdocslib.py +++ b/scripts/copy_prepdocslib.py @@ -3,11 +3,6 @@ Steps: 1. Copy `prepdocslib` into each function directory. 2. Overwrite each function's `requirements.txt` with backend `requirements.txt`. -3. Copy backend requirements again as `requirements.backend.txt` for audit. - -No backups retained (per user request). The previous minimal requirements are -discarded. All functions now share identical pinned versions ensuring imports -like `azure.core` are available. """ from __future__ import annotations @@ -49,10 +44,6 @@ def main() -> None: overwrite_req = func_dir / "requirements.txt" shutil.copy2(backend_requirements, overwrite_req) - # 3. 
Copy backend requirements for explicit provenance - audit_req = func_dir / "requirements.backend.txt" - shutil.copy2(backend_requirements, audit_req) - if __name__ == "__main__": main() From 9ac595faab93ea47269d70b39a4ac557a846c275 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Tue, 4 Nov 2025 17:14:06 -0800 Subject: [PATCH 09/30] Updates to function auth --- .../figure_processor/function_app.py | 2 +- infra/app/functions-app.bicep | 50 ++++-- infra/app/functions-rbac.bicep | 20 +-- infra/app/functions.bicep | 157 ++++++------------ infra/core/auth/appregistration.bicep | 79 ++++++--- infra/core/auth/appupdate.bicep | 42 ----- infra/main.bicep | 12 +- 7 files changed, 151 insertions(+), 211 deletions(-) delete mode 100644 infra/core/auth/appupdate.bicep diff --git a/app/functions/figure_processor/function_app.py b/app/functions/figure_processor/function_app.py index 0c1fcfcb38..76eda4715d 100644 --- a/app/functions/figure_processor/function_app.py +++ b/app/functions/figure_processor/function_app.py @@ -118,7 +118,7 @@ @app.function_name(name="process_figure") -@app.route(route="process", methods=["POST"], auth_level=func.AuthLevel.ANONYMOUS) +@app.route(route="process", methods=["POST"]) async def process_figure_request(req: func.HttpRequest) -> func.HttpResponse: """Entrypoint for Azure Search custom skill calls.""" diff --git a/infra/app/functions-app.bicep b/infra/app/functions-app.bicep index 13f29e9370..cd660f5ea9 100644 --- a/infra/app/functions-app.bicep +++ b/infra/app/functions-app.bicep @@ -14,10 +14,16 @@ param instanceMemoryMB int = 2048 param maximumInstanceCount int = 10 param identityId string param identityClientId string -// App Registration client ID (applicationId) used to secure Function App endpoints (Easy Auth) -param skillAppClientId string = '' -// Audience / identifier URI to validate tokens (e.g. 
api:///-skill) -param skillAppAudience string = '' + +// Authorization parameters +@description('The Entra ID application (client) ID for App Service Authentication') +param authClientId string + +@description('The Entra ID identifier URI for App Service Authentication') +param authIdentifierUri string + +@description('The Azure AD tenant ID for App Service Authentication') +param authTenantId string // AVM expects authentication.type values: SystemAssignedIdentity | UserAssignedIdentity | StorageAccountConnectionString // Use UserAssignedIdentity for per-function user-assigned managed identity deployment storage access. @@ -51,14 +57,12 @@ var appInsightsSettings = !empty(applicationInsightsName) ? { APPLICATIONINSIGHTS_CONNECTION_STRING: applicationInsights.?properties.ConnectionString ?? '' } : {} -// Surface skill application identifiers for downstream logging/diagnostics (not used for manual validation now that Easy Auth is enabled) -var skillAudienceSettings = (!empty(skillAppClientId) && !empty(skillAppAudience)) ? 
{ - SKILL_APP_ID: skillAppClientId - SKILL_APP_AUDIENCE: skillAppAudience -} : {} +var easyAuthSettings = { + OVERRIDE_USE_MI_FIC_ASSERTION_CLIENTID: identityClientId +} // Merge all app settings -var allAppSettings = union(appSettings, baseAppSettings, appInsightsSettings, skillAudienceSettings) +var allAppSettings = union(appSettings, baseAppSettings, appInsightsSettings, easyAuthSettings) // Create Flex Consumption Function App using AVM module functionApp 'br/public:avm/res/web/site:0.15.1' = { @@ -70,7 +74,9 @@ module functionApp 'br/public:avm/res/web/site:0.15.1' = { tags: tags serverFarmResourceId: appServicePlanId managedIdentities: { - userAssignedResourceIds: [identityId] + userAssignedResourceIds: [ + '${identityId}' + ] } functionAppConfig: { deployment: { @@ -107,26 +113,38 @@ module functionApp 'br/public:avm/res/web/site:0.15.1' = { // Enable Easy Auth (App Service authentication) for Azure Search custom skill access when a skillAppId is provided. // Based on Microsoft guidance: require authentication, return 401 on unauthenticated, allowed audience api://{applicationId}. 
-resource auth 'Microsoft.Web/sites/config@2022-03-01' = if (!empty(skillAppClientId) && !empty(skillAppAudience)) { +resource auth 'Microsoft.Web/sites/config@2022-03-01' = { name: '${name}/authsettingsV2' + dependsOn: [ + functionApp // Ensure the Function App module completes before configuring authentication + ] properties: { globalValidation: { requireAuthentication: true unauthenticatedClientAction: 'Return401' + redirectToProvider: 'azureactivedirectory' } identityProviders: { azureActiveDirectory: { enabled: true registration: { - clientId: skillAppClientId + openIdIssuer: '${environment().authentication.loginEndpoint}${authTenantId}/v2.0' + clientId: authClientId + clientSecretSettingName: 'OVERRIDE_USE_MI_FIC_ASSERTION_CLIENTID' } validation: { - allowedAudiences: [ skillAppAudience ] + jwtClaimChecks: {} + allowedAudiences: [ + authIdentifierUri + ] + defaultAuthorizationPolicy: { + allowedPrincipals: {} + allowedApplications: [authClientId] + } } } } } - dependsOn: [ functionApp ] } // Outputs @@ -134,4 +152,4 @@ output name string = functionApp.outputs.name output defaultHostname string = functionApp.outputs.defaultHostname // Expose resourceId for downstream skill auth configuration output resourceId string = functionApp.outputs.resourceId -output authEnabled bool = !empty(skillAppClientId) && !empty(skillAppAudience) +output authEnabled bool = !empty(authClientId) && !empty(authIdentifierUri) diff --git a/infra/app/functions-rbac.bicep b/infra/app/functions-rbac.bicep index 72e39a6bf2..6a4e9df67f 100644 --- a/infra/app/functions-rbac.bicep +++ b/infra/app/functions-rbac.bicep @@ -23,7 +23,7 @@ var monitoringMetricsPublisherRoleId = '3913510d-42f4-4e42-8a64-420c390055eb' // // Storage: Blob Data Reader (read content container) module storageBlobReaderRole '../core/security/role.bicep' = { scope: resourceGroup(storageResourceGroupName) - name: 'storage-blob-reader-${uniqueString(principalId)}' + name: 
'function-storage-blob-reader-${uniqueString(principalId)}' params: { principalId: principalId roleDefinitionId: storageBlobDataReaderRoleId @@ -34,7 +34,7 @@ module storageBlobReaderRole '../core/security/role.bicep' = { // Storage: Blob Data Contributor (write images container, deployment container) module storageBlobContributorRole '../core/security/role.bicep' = { scope: resourceGroup(storageResourceGroupName) - name: 'storage-blob-contributor-${uniqueString(principalId)}' + name: 'function-storage-blob-contributor-${uniqueString(principalId)}' params: { principalId: principalId roleDefinitionId: storageBlobDataContributorRoleId @@ -45,7 +45,7 @@ module storageBlobContributorRole '../core/security/role.bicep' = { // Storage: Queue Data Contributor (for AzureWebJobsStorage) module storageQueueContributorRole '../core/security/role.bicep' = { scope: resourceGroup(storageResourceGroupName) - name: 'storage-queue-contributor-${uniqueString(principalId)}' + name: 'function-storage-queue-contributor-${uniqueString(principalId)}' params: { principalId: principalId roleDefinitionId: storageQueueDataContributorRoleId @@ -56,7 +56,7 @@ module storageQueueContributorRole '../core/security/role.bicep' = { // Storage: Table Data Contributor (for AzureWebJobsStorage) module storageTableContributorRole '../core/security/role.bicep' = { scope: resourceGroup(storageResourceGroupName) - name: 'storage-table-contributor-${uniqueString(principalId)}' + name: 'function-storage-table-contributor-${uniqueString(principalId)}' params: { principalId: principalId roleDefinitionId: storageTableDataContributorRoleId @@ -67,7 +67,7 @@ module storageTableContributorRole '../core/security/role.bicep' = { // Search: Index Data Contributor (write chunks to index) module searchIndexContributorRole '../core/security/role.bicep' = { scope: resourceGroup(searchServiceResourceGroupName) - name: 'search-index-contributor-${uniqueString(principalId)}' + name: 
'function-search-index-contributor-${uniqueString(principalId)}' params: { principalId: principalId roleDefinitionId: searchIndexDataContributorRoleId @@ -78,7 +78,7 @@ module searchIndexContributorRole '../core/security/role.bicep' = { // OpenAI: Cognitive Services OpenAI User module openAiUserRole '../core/security/role.bicep' = { scope: resourceGroup(openAiResourceGroupName) - name: 'openai-user-${uniqueString(principalId)}' + name: 'function-openai-user-${uniqueString(principalId)}' params: { principalId: principalId roleDefinitionId: cognitiveServicesOpenAIUserRoleId @@ -89,7 +89,7 @@ module openAiUserRole '../core/security/role.bicep' = { // Document Intelligence: Cognitive Services User module documentIntelligenceUserRole '../core/security/role.bicep' = { scope: resourceGroup(documentIntelligenceResourceGroupName) - name: 'doc-intelligence-user-${uniqueString(principalId)}' + name: 'function-doc-intelligence-user-${uniqueString(principalId)}' params: { principalId: principalId roleDefinitionId: cognitiveServicesUserRoleId @@ -100,7 +100,7 @@ module documentIntelligenceUserRole '../core/security/role.bicep' = { // Vision: Cognitive Services User (if multimodal) module visionUserRole '../core/security/role.bicep' = if (useMultimodal && !empty(visionServiceName)) { scope: resourceGroup(visionResourceGroupName) - name: 'vision-user-${uniqueString(principalId)}' + name: 'function-vision-user-${uniqueString(principalId)}' params: { principalId: principalId roleDefinitionId: cognitiveServicesUserRoleId @@ -111,7 +111,7 @@ module visionUserRole '../core/security/role.bicep' = if (useMultimodal && !empt // Content Understanding: Cognitive Services User (if multimodal) module contentUnderstandingUserRole '../core/security/role.bicep' = if (useMultimodal && !empty(contentUnderstandingServiceName)) { scope: resourceGroup(contentUnderstandingResourceGroupName) - name: 'content-understanding-user-${uniqueString(principalId)}' + name: 
'function-content-understanding-user-${uniqueString(principalId)}' params: { principalId: principalId roleDefinitionId: cognitiveServicesUserRoleId @@ -121,7 +121,7 @@ module contentUnderstandingUserRole '../core/security/role.bicep' = if (useMulti // Application Insights: Monitoring Metrics Publisher module appInsightsMetricsPublisherRole '../core/security/role.bicep' = { - name: 'appinsights-metrics-${uniqueString(principalId)}' + name: 'function-appinsights-metrics-${uniqueString(principalId)}' params: { principalId: principalId roleDefinitionId: monitoringMetricsPublisherRoleId diff --git a/infra/app/functions.bicep b/infra/app/functions.bicep index ba1e8dd390..0a474adaa4 100644 --- a/infra/app/functions.bicep +++ b/infra/app/functions.bicep @@ -65,6 +65,7 @@ var runtimeStorageRoles = [ ] // Common app settings for both functions +// TODO: Take the settings from main.bicep - appEnvVars var commonAppSettings = { // Storage AZURE_STORAGE_ACCOUNT: storageAccountName @@ -111,6 +112,7 @@ var contentUnderstandingSettings = useMultimodal && !empty(contentUnderstandingS var allAppSettings = union(commonAppSettings, visionSettings, contentUnderstandingSettings) // Deployment storage containers +// TODO: Can we just use a boring name, the same for all functions? 
var documentExtractorDeploymentContainer = 'deploy-doc-extractor-${take(resourceToken, 7)}' var figureProcessorDeploymentContainer = 'deploy-figure-processor-${take(resourceToken, 7)}' var textProcessorDeploymentContainer = 'deploy-text-processor-${take(resourceToken, 7)}' @@ -174,10 +176,10 @@ resource textProcessorRuntimeStorage 'Microsoft.Storage/storageAccounts@2024-01- } resource documentExtractorRuntimeStorageRoles 'Microsoft.Authorization/roleAssignments@2022-04-01' = [for role in runtimeStorageRoles: { - name: guid(documentExtractorRuntimeStorage.id, role.roleDefinitionId, 'doc-runtime') + name: guid(documentExtractorRuntimeStorage.id, role.roleDefinitionId, 'doc-storage-roles') scope: documentExtractorRuntimeStorage properties: { - principalId: documentExtractorIdentity.outputs.principalId + principalId: functionsUserIdentity.outputs.principalId principalType: 'ServicePrincipal' roleDefinitionId: resourceId('Microsoft.Authorization/roleDefinitions', role.roleDefinitionId) } @@ -187,10 +189,10 @@ resource documentExtractorRuntimeStorageRoles 'Microsoft.Authorization/roleAssig }] resource figureProcessorRuntimeStorageRoles 'Microsoft.Authorization/roleAssignments@2022-04-01' = [for role in runtimeStorageRoles: { - name: guid(figureProcessorRuntimeStorage.id, role.roleDefinitionId, 'figure-runtime') + name: guid(figureProcessorRuntimeStorage.id, role.roleDefinitionId, 'figure-storage-roles') scope: figureProcessorRuntimeStorage properties: { - principalId: figureProcessorIdentity.outputs.principalId + principalId: functionsUserIdentity.outputs.principalId principalType: 'ServicePrincipal' roleDefinitionId: resourceId('Microsoft.Authorization/roleDefinitions', role.roleDefinitionId) } @@ -200,10 +202,10 @@ resource figureProcessorRuntimeStorageRoles 'Microsoft.Authorization/roleAssignm }] resource textProcessorRuntimeStorageRoles 'Microsoft.Authorization/roleAssignments@2022-04-01' = [for role in runtimeStorageRoles: { - name: 
guid(textProcessorRuntimeStorage.id, role.roleDefinitionId, 'text-runtime') + name: guid(textProcessorRuntimeStorage.id, role.roleDefinitionId, 'text-storage-roles') scope: textProcessorRuntimeStorage properties: { - principalId: textProcessorIdentity.outputs.principalId + principalId: functionsUserIdentity.outputs.principalId principalType: 'ServicePrincipal' roleDefinitionId: resourceId('Microsoft.Authorization/roleDefinitions', role.roleDefinitionId) } @@ -212,36 +214,6 @@ resource textProcessorRuntimeStorageRoles 'Microsoft.Authorization/roleAssignmen ] }] -// User-assigned managed identity for document extractor -module documentExtractorIdentity 'br/public:avm/res/managed-identity/user-assigned-identity:0.4.1' = { - name: 'doc-extractor-identity' - params: { - location: location - tags: tags - name: '${abbrs.managedIdentityUserAssignedIdentities}doc-extractor-${resourceToken}' - } -} - -// User-assigned managed identity for text processor -module textProcessorIdentity 'br/public:avm/res/managed-identity/user-assigned-identity:0.4.1' = { - name: 'text-processor-identity' - params: { - location: location - tags: tags - name: '${abbrs.managedIdentityUserAssignedIdentities}text-processor-${resourceToken}' - } -} - -// User-assigned managed identity for figure processor -module figureProcessorIdentity 'br/public:avm/res/managed-identity/user-assigned-identity:0.4.1' = { - name: 'figure-processor-identity' - params: { - location: location - tags: tags - name: '${abbrs.managedIdentityUserAssignedIdentities}figure-processor-${resourceToken}' - } -} - // Flex Consumption supports only one Function App per plan; create a dedicated plan per ingestion function module documentExtractorPlan 'br/public:avm/res/web/serverfarm:0.1.1' = { name: 'doc-extractor-plan' @@ -285,15 +257,28 @@ module textProcessorPlan 'br/public:avm/res/web/serverfarm:0.1.1' = { } } + +module functionsUserIdentity 'br/public:avm/res/managed-identity/user-assigned-identity:0.4.1' = { + name: 
'functions-user-identity' + params: { + location: location + tags: tags + name: 'functions-user-identity-${resourceToken}' + } +} + + + // Document Extractor Function App // App registration for document extractor (uses function identity principalId as FIC subject) module documentExtractorAppReg '../core/auth/appregistration.bicep' = { name: 'doc-extractor-appreg' params: { + appUniqueName: '${documentExtractorName}-appreg' cloudEnvironment: environment().name - webAppIdentityId: documentExtractorIdentity.outputs.principalId - clientAppName: 'skill-${documentExtractorName}' - clientAppDisplayName: 'skill-${documentExtractorName}' + webAppIdentityId: functionsUserIdentity.outputs.principalId + clientAppName: '${documentExtractorName}-app' + clientAppDisplayName: '${documentExtractorName} Entra App' issuer: openIdIssuer webAppEndpoint: 'https://${documentExtractorName}.azurewebsites.net' } @@ -309,18 +294,18 @@ module documentExtractor 'functions-app.bicep' = { appServicePlanId: documentExtractorPlan.outputs.resourceId runtimeName: 'python' runtimeVersion: '3.11' + identityId: functionsUserIdentity.outputs.resourceId + identityClientId: functionsUserIdentity.outputs.clientId + authClientId: documentExtractorAppReg.outputs.clientAppId + authIdentifierUri: documentExtractorAppReg.outputs.identifierUri + authTenantId: tenant().tenantId storageAccountName: documentExtractorRuntimeStorageName deploymentStorageContainerName: documentExtractorDeploymentContainer - identityId: documentExtractorIdentity.outputs.resourceId - identityClientId: documentExtractorIdentity.outputs.clientId appSettings: union(allAppSettings, { AzureFunctionsWebHost__hostid: documentExtractorHostId }) instanceMemoryMB: 4096 // High memory for document processing maximumInstanceCount: 100 - // Removed unused functionTimeout parameter; configured defaults via host settings - skillAppClientId: documentExtractorAppReg.outputs.clientAppId - skillAppAudience: 
'api://${documentExtractorAppReg.outputs.clientAppId}' } dependsOn: [ documentExtractorRuntimeStorageAccount @@ -331,8 +316,9 @@ module documentExtractor 'functions-app.bicep' = { module figureProcessorAppReg '../core/auth/appregistration.bicep' = { name: 'figure-processor-appreg' params: { + appUniqueName: '${figureProcessorName}-app' cloudEnvironment: environment().name - webAppIdentityId: figureProcessorIdentity.outputs.principalId + webAppIdentityId: functionsUserIdentity.outputs.principalId clientAppName: 'skill-${figureProcessorName}' clientAppDisplayName: 'skill-${figureProcessorName}' issuer: openIdIssuer @@ -352,15 +338,16 @@ module figureProcessor 'functions-app.bicep' = { runtimeVersion: '3.11' storageAccountName: figureProcessorRuntimeStorageName deploymentStorageContainerName: figureProcessorDeploymentContainer - identityId: figureProcessorIdentity.outputs.resourceId - identityClientId: figureProcessorIdentity.outputs.clientId + identityId: functionsUserIdentity.outputs.resourceId + identityClientId: functionsUserIdentity.outputs.clientId + authClientId: figureProcessorAppReg.outputs.clientAppId + authIdentifierUri: figureProcessorAppReg.outputs.identifierUri + authTenantId: tenant().tenantId appSettings: union(allAppSettings, { AzureFunctionsWebHost__hostid: figureProcessorHostId }) instanceMemoryMB: 2048 maximumInstanceCount: 100 - skillAppClientId: figureProcessorAppReg.outputs.clientAppId - skillAppAudience: 'api://${figureProcessorAppReg.outputs.clientAppId}' } dependsOn: [ figureProcessorRuntimeStorageAccount @@ -371,8 +358,9 @@ module figureProcessor 'functions-app.bicep' = { module textProcessorAppReg '../core/auth/appregistration.bicep' = { name: 'text-processor-appreg' params: { + appUniqueName: '${textProcessorName}-app' cloudEnvironment: environment().name - webAppIdentityId: textProcessorIdentity.outputs.principalId + webAppIdentityId: functionsUserIdentity.outputs.principalId clientAppName: 'skill-${textProcessorName}' 
clientAppDisplayName: 'skill-${textProcessorName}' issuer: openIdIssuer @@ -392,15 +380,16 @@ module textProcessor 'functions-app.bicep' = { runtimeVersion: '3.11' storageAccountName: textProcessorRuntimeStorageName deploymentStorageContainerName: textProcessorDeploymentContainer - identityId: textProcessorIdentity.outputs.resourceId - identityClientId: textProcessorIdentity.outputs.clientId + identityId: functionsUserIdentity.outputs.resourceId + identityClientId: functionsUserIdentity.outputs.clientId + authClientId: textProcessorAppReg.outputs.clientAppId + authIdentifierUri: textProcessorAppReg.outputs.identifierUri + authTenantId: tenant().tenantId appSettings: union(allAppSettings, { AzureFunctionsWebHost__hostid: textProcessorHostId }) instanceMemoryMB: 2048 // Standard memory for embedding maximumInstanceCount: 100 - skillAppClientId: textProcessorAppReg.outputs.clientAppId - skillAppAudience: 'api://${textProcessorAppReg.outputs.clientAppId}' } dependsOn: [ textProcessorRuntimeStorageAccount @@ -408,27 +397,10 @@ module textProcessor 'functions-app.bicep' = { } // RBAC: Document Extractor Roles -module documentExtractorRbac 'functions-rbac.bicep' = { +module functionsIdentityRBAC 'functions-rbac.bicep' = { name: 'doc-extractor-rbac' params: { - principalId: documentExtractorIdentity.outputs.principalId - storageResourceGroupName: storageResourceGroupName - searchServiceResourceGroupName: searchServiceResourceGroupName - openAiResourceGroupName: openAiResourceGroupName - documentIntelligenceResourceGroupName: documentIntelligenceResourceGroupName - visionServiceName: visionServiceName - visionResourceGroupName: visionResourceGroupName - contentUnderstandingServiceName: contentUnderstandingServiceName - contentUnderstandingResourceGroupName: contentUnderstandingResourceGroupName - useMultimodal: useMultimodal - } -} - -// RBAC: Text Processor Roles -module textProcessorRbac 'functions-rbac.bicep' = { - name: 'text-processor-rbac' - params: { - principalId: 
textProcessorIdentity.outputs.principalId + principalId: functionsUserIdentity.outputs.principalId storageResourceGroupName: storageResourceGroupName searchServiceResourceGroupName: searchServiceResourceGroupName openAiResourceGroupName: openAiResourceGroupName @@ -441,48 +413,15 @@ module textProcessorRbac 'functions-rbac.bicep' = { } } -// RBAC: Figure Processor Roles -module figureProcessorRbac 'functions-rbac.bicep' = { - name: 'figure-processor-rbac' - params: { - principalId: figureProcessorIdentity.outputs.principalId - storageResourceGroupName: storageResourceGroupName - searchServiceResourceGroupName: searchServiceResourceGroupName - openAiResourceGroupName: openAiResourceGroupName - documentIntelligenceResourceGroupName: documentIntelligenceResourceGroupName - visionServiceName: visionServiceName - visionResourceGroupName: visionResourceGroupName - contentUnderstandingServiceName: contentUnderstandingServiceName - contentUnderstandingResourceGroupName: contentUnderstandingResourceGroupName - useMultimodal: useMultimodal - } -} // Outputs output documentExtractorName string = documentExtractor.outputs.name output documentExtractorUrl string = documentExtractor.outputs.defaultHostname -output documentExtractorIdentityPrincipalId string = documentExtractorIdentity.outputs.principalId -output documentExtractorClientAppId string = documentExtractorAppReg.outputs.clientAppId -output documentExtractorSkillResourceId string = documentExtractorAppReg.outputs.clientAppId output figureProcessorName string = figureProcessor.outputs.name output figureProcessorUrl string = figureProcessor.outputs.defaultHostname -output figureProcessorIdentityPrincipalId string = figureProcessorIdentity.outputs.principalId -output figureProcessorClientAppId string = figureProcessorAppReg.outputs.clientAppId -output figureProcessorSkillResourceId string = figureProcessorAppReg.outputs.clientAppId output textProcessorName string = textProcessor.outputs.name output textProcessorUrl string 
= textProcessor.outputs.defaultHostname -output textProcessorIdentityPrincipalId string = textProcessorIdentity.outputs.principalId -output textProcessorClientAppId string = textProcessorAppReg.outputs.clientAppId -output textProcessorSkillResourceId string = textProcessorAppReg.outputs.clientAppId -output documentExtractorRuntimeStorageName string = documentExtractorRuntimeStorageName -output figureProcessorRuntimeStorageName string = figureProcessorRuntimeStorageName -output textProcessorRuntimeStorageName string = textProcessorRuntimeStorageName -output documentExtractorHostId string = documentExtractorHostId -output figureProcessorHostId string = figureProcessorHostId -output textProcessorHostId string = textProcessorHostId -// Output the last plan id (text processor) for potential diagnostics; others can be added if needed -output appServicePlanId string = textProcessorPlan.outputs.resourceId // Resource IDs for each function app (used for auth_resource_id with managed identity secured skills) -output documentExtractorResourceId string = documentExtractor.outputs.resourceId -output figureProcessorResourceId string = figureProcessor.outputs.resourceId -output textProcessorResourceId string = textProcessor.outputs.resourceId +output documentExtractorAuthIdentifierUri string = documentExtractorAppReg.outputs.identifierUri +output figureProcessorAuthIdentifierUri string = figureProcessorAppReg.outputs.identifierUri +output textProcessorAuthIdentifierUri string = textProcessorAppReg.outputs.identifierUri diff --git a/infra/core/auth/appregistration.bicep b/infra/core/auth/appregistration.bicep index 3bf29ce701..baf8076de3 100644 --- a/infra/core/auth/appregistration.bicep +++ b/infra/core/auth/appregistration.bicep @@ -3,6 +3,9 @@ extension microsoftGraphV1 @description('Specifies the name of cloud environment to run this deployment in.') param cloudEnvironment string = environment().name +@description('The unique name for the application registration (used for 
idempotency)') +param appUniqueName string + // NOTE: Microsoft Graph Bicep file deployment is only supported in Public Cloud @description('Audience uris for public and national clouds') param audiences object = { @@ -32,60 +35,82 @@ param clientAppName string @description('Specifies the display name for the client application') param clientAppDisplayName string -@description('Specifies the scopes that the client application requires.') -param clientAppScopes array = ['User.Read', 'offline_access', 'openid', 'profile'] - param serviceManagementReference string = '' param issuer string param webAppEndpoint string -// Get the MS Graph Service Principal based on its application ID: -// https://learn.microsoft.com/troubleshoot/entra/entra-id/governance/verify-first-party-apps-sign-in -var msGraphAppId = '00000003-0000-0000-c000-000000000000' -resource msGraphSP 'Microsoft.Graph/servicePrincipals@v1.0' existing = { - appId: msGraphAppId +// Combine default scope with custom scopes +var defaultScopeValue = 'user_impersonation' +var defaultScopeId = guid(appUniqueName, 'default-scope', defaultScopeValue) + +var userImpersonationScope = { + adminConsentDescription: 'Allow the application to access the API on behalf of the signed-in user' + adminConsentDisplayName: 'Access application as user' + id: defaultScopeId + isEnabled: true + type: 'User' + userConsentDescription: 'Allow the application to access the API on behalf of the signed-in user' + userConsentDisplayName: 'Access application as user' + value: defaultScopeValue } -var graphScopes = msGraphSP.oauth2PermissionScopes -resource clientApp 'Microsoft.Graph/applications@v1.0' = { +var allScopes = [ + userImpersonationScope +] + +// Is this going to work with search service? Otherwise we have to set behind the scene? 
+var identifierUri = 'api://${appUniqueName}-${uniqueString(subscription().id, resourceGroup().id, appUniqueName)}' + +resource appRegistration 'Microsoft.Graph/applications@v1.0' = { uniqueName: clientAppName displayName: clientAppDisplayName signInAudience: 'AzureADMyOrg' serviceManagementReference: empty(serviceManagementReference) ? null : serviceManagementReference + identifierUris: [identifierUri] + api: { + oauth2PermissionScopes: allScopes + requestedAccessTokenVersion: 2 + // Not doing preauthorized apps + } web: { redirectUris: [ - 'http://localhost:50505/.auth/login/aad/callback' '${webAppEndpoint}/.auth/login/aad/callback' ] implicitGrantSettings: { enableIdTokenIssuance: true } } requiredResourceAccess: [ - { - resourceAppId: msGraphAppId + { + // Microsoft Graph permissions + resourceAppId: '00000003-0000-0000-c000-000000000000' resourceAccess: [ - for (scope, i) in clientAppScopes: { - id: filter(graphScopes, graphScopes => graphScopes.value == scope)[0].id + { + // User.Read delegated permission + id: 'e1fe6dd8-ba31-4d61-89e7-88639da4683d' type: 'Scope' } ] } ] - resource clientAppFic 'federatedIdentityCredentials@v1.0' = { - name: '${clientApp.uniqueName}/miAsFic' - audiences: [ - audiences[cloudEnvironment].uri - ] - issuer: issuer - subject: webAppIdentityId - } } -resource clientSp 'Microsoft.Graph/servicePrincipals@v1.0' = { - appId: clientApp.appId +resource appServicePrincipal 'Microsoft.Graph/servicePrincipals@v1.0' = { + appId: appRegistration.appId } -output clientAppId string = clientApp.appId -output clientSpId string = clientSp.id +resource federatedIdentityCredential 'Microsoft.Graph/applications/federatedIdentityCredentials@v1.0' = { + name: '${appRegistration.uniqueName}/miAsFic' + audiences: [ + audiences[cloudEnvironment].uri + ] + issuer: issuer + subject: webAppIdentityId +} + +output clientAppId string = appRegistration.appId +output clientSpId string = appServicePrincipal.id + +@description('The identifier URI of the 
application - returns the actual URI that was set') +output identifierUri string = identifierUri diff --git a/infra/core/auth/appupdate.bicep b/infra/core/auth/appupdate.bicep deleted file mode 100644 index 74c2e87b1b..0000000000 --- a/infra/core/auth/appupdate.bicep +++ /dev/null @@ -1,42 +0,0 @@ -param appServiceName string - -@description('The client ID of the Microsoft Entra application.') -param clientId string - -param openIdIssuer string - -resource appService 'Microsoft.Web/sites@2022-03-01' existing = { - name: appServiceName -} - -resource configAuth 'Microsoft.Web/sites/config@2022-03-01' = { - parent: appService - name: 'authsettingsV2' - properties: { - globalValidation: { - requireAuthentication: true - unauthenticatedClientAction: 'RedirectToLoginPage' - redirectToProvider: 'azureactivedirectory' - } - identityProviders: { - azureActiveDirectory: { - enabled: true - registration: { - clientId: clientId - clientSecretSettingName: 'OVERRIDE_USE_MI_FIC_ASSERTION_CLIENTID' - openIdIssuer: openIdIssuer - } - validation: { - defaultAuthorizationPolicy: { - allowedApplications: [] - } - } - } - } - login: { - tokenStore: { - enabled: true - } - } - } -} diff --git a/infra/main.bicep b/infra/main.bicep index 1a50e42596..ad9fa08eaf 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -543,9 +543,9 @@ var appEnvVariables = { FIGURE_PROCESSOR_SKILL_ENDPOINT: useCloudIngestion ? 'https://${functions!.outputs.figureProcessorUrl}/api/process' : '' TEXT_PROCESSOR_SKILL_ENDPOINT: useCloudIngestion ? 'https://${functions!.outputs.textProcessorUrl}/api/process' : '' // Skill audience identifier URI from registration module (created below) - DOCUMENT_EXTRACTOR_SKILL_RESOURCE_ID: useCloudIngestion ? functions!.outputs.documentExtractorSkillResourceId : '' - FIGURE_PROCESSOR_SKILL_RESOURCE_ID: useCloudIngestion ? functions!.outputs.figureProcessorSkillResourceId : '' - TEXT_PROCESSOR_SKILL_RESOURCE_ID: useCloudIngestion ? 
functions!.outputs.textProcessorSkillResourceId : '' + DOCUMENT_EXTRACTOR_SKILL_AUTH_RESOURCE_ID: useCloudIngestion ? functions!.outputs.documentExtractorAuthIdentifierUri : '' + FIGURE_PROCESSOR_SKILL_AUTH_RESOURCE_ID: useCloudIngestion ? functions!.outputs.figureProcessorAuthIdentifierUri : '' + TEXT_PROCESSOR_SKILL_AUTH_RESOURCE_ID: useCloudIngestion ? functions!.outputs.textProcessorAuthIdentifierUri : '' } // App Service for the web application (Python Quart app with JS frontend) @@ -1509,9 +1509,9 @@ output DOCUMENT_EXTRACTOR_SKILL_ENDPOINT string = useCloudIngestion ? 'https://$ output FIGURE_PROCESSOR_SKILL_ENDPOINT string = useCloudIngestion ? 'https://${functions!.outputs.figureProcessorUrl}/api/process' : '' output TEXT_PROCESSOR_SKILL_ENDPOINT string = useCloudIngestion ? 'https://${functions!.outputs.textProcessorUrl}/api/process' : '' // Identifier URI used as authResourceId for all custom skill endpoints -output DOCUMENT_EXTRACTOR_SKILL_RESOURCE_ID string = useCloudIngestion ? functions!.outputs.documentExtractorSkillResourceId : '' -output FIGURE_PROCESSOR_SKILL_RESOURCE_ID string = useCloudIngestion ? functions!.outputs.figureProcessorSkillResourceId : '' -output TEXT_PROCESSOR_SKILL_RESOURCE_ID string = useCloudIngestion ? functions!.outputs.textProcessorSkillResourceId : '' +output DOCUMENT_EXTRACTOR_SKILL_AUTH_RESOURCE_ID string = useCloudIngestion ? functions!.outputs.documentExtractorAuthIdentifierUri : '' +output FIGURE_PROCESSOR_SKILL_AUTH_RESOURCE_ID string = useCloudIngestion ? functions!.outputs.figureProcessorAuthIdentifierUri : '' +output TEXT_PROCESSOR_SKILL_AUTH_RESOURCE_ID string = useCloudIngestion ? functions!.outputs.textProcessorAuthIdentifierUri : '' output AZURE_AI_PROJECT string = useAiProject ? 
ai.outputs.projectName : '' From d8dd729ab1ae0f9ababdd2adf82cf11eb2f2a81d Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Wed, 5 Nov 2025 14:48:52 -0800 Subject: [PATCH 10/30] latest changes to get auth working --- app/backend/prepdocs.py | 6 +-- .../prepdocslib/cloudingestionstrategy.py | 33 +++++++------ .../document_extractor/function_app.py | 4 +- app/functions/document_extractor/host.json | 7 +++ .../figure_processor/function_app.py | 5 +- app/functions/figure_processor/host.json | 7 +++ app/functions/text_processor/function_app.py | 46 ++++++++++++------- app/functions/text_processor/host.json | 7 +++ infra/app/functions-app.bicep | 40 +++++++++++++++- 9 files changed, 114 insertions(+), 41 deletions(-) diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py index c1e68d318d..5e652ddb21 100644 --- a/app/backend/prepdocs.py +++ b/app/backend/prepdocs.py @@ -461,11 +461,11 @@ async def main(strategy: Strategy, setup_index: bool = True): raise ValueError("Cloud ingestion requires Azure OpenAI embeddings to configure the search index.") document_extractor_uri = require_env_var("DOCUMENT_EXTRACTOR_SKILL_ENDPOINT") - document_extractor_resource_id = require_env_var("DOCUMENT_EXTRACTOR_SKILL_RESOURCE_ID") + document_extractor_resource_id = require_env_var("DOCUMENT_EXTRACTOR_SKILL_AUTH_RESOURCE_ID") figure_processor_uri = require_env_var("FIGURE_PROCESSOR_SKILL_ENDPOINT") - figure_processor_resource_id = require_env_var("FIGURE_PROCESSOR_SKILL_RESOURCE_ID") + figure_processor_resource_id = require_env_var("FIGURE_PROCESSOR_SKILL_AUTH_RESOURCE_ID") text_processor_uri = require_env_var("TEXT_PROCESSOR_SKILL_ENDPOINT") - text_processor_resource_id = require_env_var("TEXT_PROCESSOR_SKILL_RESOURCE_ID") + text_processor_resource_id = require_env_var("TEXT_PROCESSOR_SKILL_AUTH_RESOURCE_ID") search_embedding_field = require_env_var("AZURE_SEARCH_FIELD_NAME_EMBEDDING") ingestion_strategy = CloudIngestionStrategy( diff --git 
a/app/backend/prepdocslib/cloudingestionstrategy.py b/app/backend/prepdocslib/cloudingestionstrategy.py index 80df119400..b706433140 100644 --- a/app/backend/prepdocslib/cloudingestionstrategy.py +++ b/app/backend/prepdocslib/cloudingestionstrategy.py @@ -10,6 +10,8 @@ NativeBlobSoftDeleteDeletionDetectionPolicy, ) from azure.search.documents.indexes.models import ( + IndexingParameters, + IndexingParametersConfiguration, IndexProjectionMode, InputFieldMappingEntry, OutputFieldMappingEntry, @@ -102,33 +104,23 @@ def __init__( self.indexer_name = f"{prefix}-indexer" self.data_source_name = f"{prefix}-blob" - def _ensure_default_scope(val: str) -> str: - # If already ends with '/.default' keep as-is. - if val.endswith("/.default"): - return val - # If already contains '.default' (rare variant) keep. - if val.endswith(".default"): - return val - # Append '/.default' consistently (works for both raw appId and api://appId forms). - return f"{val}/.default" - self.document_extractor = _SkillConfig( name=f"{prefix}-document-extractor-skill", description="Custom skill that downloads and parses source documents", uri=document_extractor_uri, - auth_resource_id=_ensure_default_scope(document_extractor_auth_resource_id), + auth_resource_id=document_extractor_auth_resource_id, ) self.figure_processor = _SkillConfig( name=f"{prefix}-figure-processor-skill", description="Custom skill that enriches individual figures", uri=figure_processor_uri, - auth_resource_id=_ensure_default_scope(figure_processor_auth_resource_id), + auth_resource_id=figure_processor_auth_resource_id, ) self.text_processor = _SkillConfig( name=f"{prefix}-text-processor-skill", description="Custom skill that merges figures, chunks text, and generates embeddings", uri=text_processor_uri, - auth_resource_id=_ensure_default_scope(text_processor_auth_resource_id), + auth_resource_id=text_processor_auth_resource_id, ) self._search_manager: SearchManager | None = None @@ -166,12 +158,10 @@ def 
_build_document_extractor_skill(self) -> WebApiSkill: # Managed identity: Search service authenticates against the function app using this resource ID. auth_resource_id=self.document_extractor.auth_resource_id, inputs=[ - InputFieldMappingEntry(name="blobUrl", source="/document/metadata_storage_path"), + # Provide the binary payload expected by the document extractor custom skill. + InputFieldMappingEntry(name="file_data", source="/document/file_data"), InputFieldMappingEntry(name="file_name", source="/document/metadata_storage_name"), InputFieldMappingEntry(name="content_type", source="/document/metadata_storage_content_type"), - InputFieldMappingEntry( - name="metadata_storage_sas_token", source="/document/metadata_storage_sas_token" - ), ], outputs=outputs, ) @@ -310,6 +300,15 @@ async def run(self) -> None: data_source_name=self.data_source_name, target_index_name=self.search_info.index_name, skillset_name=self.skillset_name, + parameters=IndexingParameters( + configuration=IndexingParametersConfiguration( + query_timeout=None, + # markdown_parsing_submode=None, + data_to_extract="contentAndMetadata", + # markdown_header_depth=None, + allow_skillset_to_read_file_data=True, + ) + ), ) async with self.search_info.create_search_indexer_client() as indexer_client: diff --git a/app/functions/document_extractor/function_app.py b/app/functions/document_extractor/function_app.py index 6b89a309c0..86324e9b53 100644 --- a/app/functions/document_extractor/function_app.py +++ b/app/functions/document_extractor/function_app.py @@ -17,7 +17,7 @@ from prepdocslib.ingestionhelpers import select_parser from prepdocslib.page import Page -app = func.FunctionApp() +app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS) logger = logging.getLogger(__name__) @@ -38,7 +38,7 @@ @app.function_name(name="extract") -@app.route(route="extract", methods=["POST"]) +@app.route(route="extract", methods=["POST"], auth_level=func.AuthLevel.ANONYMOUS) async def extract_document(req: 
func.HttpRequest) -> func.HttpResponse: """ Azure Search Custom Skill: Extract document content diff --git a/app/functions/document_extractor/host.json b/app/functions/document_extractor/host.json index 20f502bb31..c00cc23f37 100644 --- a/app/functions/document_extractor/host.json +++ b/app/functions/document_extractor/host.json @@ -1,5 +1,12 @@ { "version": "2.0", + "extensions": { + "mcp": { + "system": { + "webhookAuthorizationLevel": "anonymous" + } + } + }, "extensionBundle": { "id": "Microsoft.Azure.Functions.ExtensionBundle", "version": "[4.*, 5.0.0)" diff --git a/app/functions/figure_processor/function_app.py b/app/functions/figure_processor/function_app.py index 76eda4715d..aa82a8b2b2 100644 --- a/app/functions/figure_processor/function_app.py +++ b/app/functions/figure_processor/function_app.py @@ -31,7 +31,8 @@ setup_openai_client, ) -app = func.FunctionApp() +# Mark the function as anonymous since we are protecting it with built-in auth instead +app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS) logger = logging.getLogger(__name__) @@ -118,7 +119,7 @@ @app.function_name(name="process_figure") -@app.route(route="process", methods=["POST"]) +@app.route(route="process", methods=["POST"], auth_level=func.AuthLevel.ANONYMOUS) async def process_figure_request(req: func.HttpRequest) -> func.HttpResponse: """Entrypoint for Azure Search custom skill calls.""" diff --git a/app/functions/figure_processor/host.json b/app/functions/figure_processor/host.json index ae5ca6fb09..d26b61f2a8 100644 --- a/app/functions/figure_processor/host.json +++ b/app/functions/figure_processor/host.json @@ -1,5 +1,12 @@ { "version": "2.0", + "extensions": { + "mcp": { + "system": { + "webhookAuthorizationLevel": "anonymous" + } + } + }, "extensionBundle": { "id": "Microsoft.Azure.Functions.ExtensionBundle", "version": "[4.*, 5.0.0)" diff --git a/app/functions/text_processor/function_app.py b/app/functions/text_processor/function_app.py index 6510c60940..0eff4c5f76 
100644 --- a/app/functions/text_processor/function_app.py +++ b/app/functions/text_processor/function_app.py @@ -21,7 +21,8 @@ from prepdocslib.textprocessor import process_text from prepdocslib.textsplitter import SentenceTextSplitter -app = func.FunctionApp() +# Mark the function as anonymous since we are protecting it with built-in auth instead +app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS) logger = logging.getLogger(__name__) @@ -33,7 +34,7 @@ AZURE_OPENAI_EMB_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT", "") AZURE_OPENAI_EMB_MODEL_NAME = os.getenv("AZURE_OPENAI_EMB_MODEL_NAME", "text-embedding-3-large") AZURE_OPENAI_EMB_DIMENSIONS = int(os.getenv("AZURE_OPENAI_EMB_DIMENSIONS", "3072")) -AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "") +AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-06-01") GLOBAL_CREDENTIAL: ManagedIdentityCredential | None EMBEDDING_SERVICE: AzureOpenAIEmbeddingService | None @@ -84,7 +85,7 @@ @app.function_name(name="process_text") -@app.route(route="process", methods=["POST"]) +@app.route(route="process", methods=["POST"], auth_level=func.AuthLevel.ANONYMOUS) async def process_text_entry(req: func.HttpRequest) -> func.HttpResponse: """Azure Search custom skill entry point for chunking and embeddings.""" @@ -169,7 +170,7 @@ async def _process_document(data: dict[str, Any]) -> list[dict[str, Any]]: if not figure_payload: logger.warning("Figure ID %s not found in figures metadata for page %d", fid, page_num) continue - image_on_page = ImageOnPage.from_skill_payload(figure_payload) + image_on_page, _ = ImageOnPage.from_skill_payload(figure_payload) page_obj.images.append(image_on_page) pages.append(page_obj) @@ -202,7 +203,7 @@ async def _process_document(data: dict[str, Any]) -> list[dict[str, Any]]: content = section.chunk.text.strip() if not content: continue - embedding_vec = embeddings[idx] if embeddings else [] + embedding_vec = embeddings[idx] if embeddings else None 
image_refs: list[dict[str, Any]] = [] for image in section.chunk.images: ref = { @@ -216,16 +217,29 @@ async def _process_document(data: dict[str, Any]) -> list[dict[str, Any]]: if USE_MULTIMODAL and image.embedding is not None: ref["imageEmbedding"] = image.embedding image_refs.append(ref) - outputs.append( - { - "id": f"{normalized_id}-{idx:04d}", - "content": content, - "embedding": embedding_vec, - "sourcepage": BlobManager.sourcepage_from_file_page(file_name, section.chunk.page_num), - "sourcefile": file_name, - "parent_id": storage_url, - **({"images": image_refs} if image_refs else {}), - } - ) + chunk_entry: dict[str, Any] = { + "id": f"{normalized_id}-{idx:04d}", + "content": content, + "sourcepage": BlobManager.sourcepage_from_file_page(file_name, section.chunk.page_num), + "sourcefile": file_name, + "parent_id": storage_url, + **({"images": image_refs} if image_refs else {}), + } + + if embedding_vec is not None: + if len(embedding_vec) == AZURE_OPENAI_EMB_DIMENSIONS: + chunk_entry["embedding"] = embedding_vec + else: + logger.warning( + "Skipping embedding for %s chunk %d due to dimension mismatch (expected %d, got %d)", + file_name, + idx, + AZURE_OPENAI_EMB_DIMENSIONS, + len(embedding_vec), + ) + elif USE_VECTORS: + logger.warning("Embeddings were requested but missing for %s chunk %d", file_name, idx) + + outputs.append(chunk_entry) return outputs diff --git a/app/functions/text_processor/host.json b/app/functions/text_processor/host.json index 656342971e..d6205f876a 100644 --- a/app/functions/text_processor/host.json +++ b/app/functions/text_processor/host.json @@ -1,5 +1,12 @@ { "version": "2.0", + "extensions": { + "mcp": { + "system": { + "webhookAuthorizationLevel": "anonymous" + } + } + }, "extensionBundle": { "id": "Microsoft.Azure.Functions.ExtensionBundle", "version": "[4.*, 5.0.0)" diff --git a/infra/app/functions-app.bicep b/infra/app/functions-app.bicep index cd660f5ea9..31f1d727b9 100644 --- a/infra/app/functions-app.bicep +++ 
b/infra/app/functions-app.bicep @@ -49,6 +49,7 @@ var baseAppSettings = { AzureWebJobsStorage__queueServiceUri: stg.properties.primaryEndpoints.queue AzureWebJobsStorage__tableServiceUri: stg.properties.primaryEndpoints.table FUNCTIONS_EXTENSION_VERSION: '~4' + AZURE_CLIENT_ID: identityClientId } // Optional Application Insights settings @@ -59,6 +60,8 @@ var appInsightsSettings = !empty(applicationInsightsName) ? { var easyAuthSettings = { OVERRIDE_USE_MI_FIC_ASSERTION_CLIENTID: identityClientId + WEBSITE_AUTH_PRM_DEFAULT_WITH_SCOPES: '${authIdentifierUri}/user_impersonation' + WEBSITE_AUTH_AAD_ALLOWED_TENANTS: authTenantId } // Merge all app settings @@ -124,6 +127,15 @@ resource auth 'Microsoft.Web/sites/config@2022-03-01' = { unauthenticatedClientAction: 'Return401' redirectToProvider: 'azureactivedirectory' } + httpSettings: { + requireHttps: true + routes: { + apiPrefix: '/.auth' + } + forwardProxy: { + convention: 'NoProxy' + } + } identityProviders: { azureActiveDirectory: { enabled: true @@ -139,10 +151,36 @@ resource auth 'Microsoft.Web/sites/config@2022-03-01' = { ] defaultAuthorizationPolicy: { allowedPrincipals: {} - allowedApplications: [authClientId] + allowedApplications: null // TODO: Restrict to AI Search App } } + isAutoProvisioned: false + } + } + login: { + routes: { + logoutEndpoint: '/.auth/logout' } + tokenStore: { + enabled: true + tokenRefreshExtensionHours: 72 + fileSystem: {} + azureBlobStorage: {} + } + preserveUrlFragmentsForLogins: false + allowedExternalRedirectUrls: [] + cookieExpiration: { + convention: 'FixedTime' + timeToExpiration: '08:00:00' + } + nonce: { + validateNonce: true + nonceExpirationInterval: '00:05:00' + } + } + platform: { + enabled: true + runtimeVersion: '~1' } } } From e906fb5bc9cda6d7c73f8625994d4d42093958c5 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Fri, 7 Nov 2025 07:06:23 -0800 Subject: [PATCH 11/30] Fix tests --- tests/test_app_config.py | 4 +--- tests/test_pdfparser.py | 16 +++++++--------- 2 files 
changed, 8 insertions(+), 12 deletions(-) diff --git a/tests/test_app_config.py b/tests/test_app_config.py index 2d89108e85..00edaf6290 100644 --- a/tests/test_app_config.py +++ b/tests/test_app_config.py @@ -264,9 +264,7 @@ async def test_app_config_user_upload_bad_openai_config(monkeypatch, minimal_env monkeypatch.setenv("USE_USER_UPLOAD", "true") monkeypatch.setenv("OPENAI_HOST", "openai") quart_app = app.create_app() - with pytest.raises( - quart.testing.app.LifespanError, match="OpenAI key is required when using the non-Azure OpenAI API" - ): + with pytest.raises(quart.testing.app.LifespanError, match="OPENAI_API_KEY is required for public OpenAI host"): async with quart_app.test_app() as test_app: test_app.test_client() diff --git a/tests/test_pdfparser.py b/tests/test_pdfparser.py index 5ed36bdeb0..c006a392a5 100644 --- a/tests/test_pdfparser.py +++ b/tests/test_pdfparser.py @@ -112,10 +112,10 @@ def test_table_to_html_with_spans(): @pytest.mark.asyncio async def test_process_figure_without_bounding_regions(): figure = DocumentFigure(id="1", caption=None, bounding_regions=None) - result = await DocumentAnalysisParser.process_figure(None, figure) + result = await DocumentAnalysisParser.figure_to_image(None, figure) assert isinstance(result, ImageOnPage) - assert result.description == "" + assert result.description is None assert result.title == "" assert result.figure_id == "1" assert result.page_num == 0 @@ -143,10 +143,10 @@ def mock_crop_image_from_pdf_page(doc, page_number, bounding_box): monkeypatch.setattr(DocumentAnalysisParser, "crop_image_from_pdf_page", mock_crop_image_from_pdf_page) with caplog.at_level(logging.WARNING): - result = await DocumentAnalysisParser.process_figure(doc, figure) + result = await DocumentAnalysisParser.figure_to_image(doc, figure) assert isinstance(result, ImageOnPage) - assert result.description == "" + assert result.description is None assert result.title == "Logo" assert result.bytes == b"image_bytes" assert 
result.page_num == 0 @@ -294,8 +294,7 @@ async def mock_poller_result(): monkeypatch.setattr(mock_poller, "result", mock_poller_result) parser = DocumentAnalysisParser( - endpoint="https://example.com", - credential=MockAzureCredential(), + endpoint="https://example.com", credential=MockAzureCredential(), process_figures=True ) with open(TEST_DATA_DIR / "Simple Figure.pdf", "rb") as f: @@ -357,14 +356,13 @@ async def mock_poller_result(): monkeypatch.setattr(mock_poller, "result", mock_poller_result) parser = DocumentAnalysisParser( - endpoint="https://example.com", - credential=MockAzureCredential(), + endpoint="https://example.com", credential=MockAzureCredential(), process_figures=True ) content = io.BytesIO(b"pdf content bytes") content.name = "test.docx" with caplog.at_level(logging.ERROR): pages = [page async for page in parser.parse(content)] - assert "does not support high-resolution figure extraction" in caplog.text + assert "does not support media description." in caplog.text assert len(pages) == 1 assert pages[0].page_num == 0 From f7638d4fa4fba23444f596de16ac33ed6ba402c1 Mon Sep 17 00:00:00 2001 From: Matt Gotteiner Date: Sat, 8 Nov 2025 17:01:19 -0600 Subject: [PATCH 12/30] always upload local files --- app/backend/prepdocslib/blobmanager.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/app/backend/prepdocslib/blobmanager.py b/app/backend/prepdocslib/blobmanager.py index 49e3a5a9f3..fe8d2d56fb 100644 --- a/app/backend/prepdocslib/blobmanager.py +++ b/app/backend/prepdocslib/blobmanager.py @@ -424,7 +424,8 @@ async def upload_blob(self, file: File) -> str: await container_client.create_container() # Re-open and upload the original file - if file.url is None: + # URL may be a path to a local file or already set to a blob URL + if file.url is None or os.path.exists(file.url): with open(file.content.name, "rb") as reopened_file: blob_name = self.blob_name_from_file_name(file.content.name) logger.info("Uploading blob for document 
'%s'", blob_name) From ba1a99760588ec7a040525bc91b1a14e769a1f0e Mon Sep 17 00:00:00 2001 From: Matt Gotteiner Date: Sat, 8 Nov 2025 17:08:32 -0600 Subject: [PATCH 13/30] update to storageMetadata extraction --- app/backend/prepdocslib/cloudingestionstrategy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/backend/prepdocslib/cloudingestionstrategy.py b/app/backend/prepdocslib/cloudingestionstrategy.py index b706433140..d313b7905a 100644 --- a/app/backend/prepdocslib/cloudingestionstrategy.py +++ b/app/backend/prepdocslib/cloudingestionstrategy.py @@ -304,7 +304,7 @@ async def run(self) -> None: configuration=IndexingParametersConfiguration( query_timeout=None, # markdown_parsing_submode=None, - data_to_extract="contentAndMetadata", + data_to_extract="storageMetadata", # markdown_header_depth=None, allow_skillset_to_read_file_data=True, ) From 628609a84e7543e3ef5bce1aa09072a6fb5ca47d Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Mon, 10 Nov 2025 09:24:34 -0800 Subject: [PATCH 14/30] Got it working --- AGENTS.md | 4 + app/backend/prepdocslib/blobmanager.py | 2 +- app/backend/prepdocslib/cloud_vs_local.txt | 53 ++++ .../prepdocslib/cloudingestionstrategy.py | 229 +++++++++--------- app/backend/prepdocslib/filestrategy.py | 4 +- app/backend/prepdocslib/page.py | 21 +- app/backend/prepdocslib/textprocessor.py | 4 +- .../figure_processor/function_app.py | 20 +- app/functions/text_processor/function_app.py | 73 +++++- docs/cloud_ingestion.md | 141 +++++++++-- 10 files changed, 393 insertions(+), 158 deletions(-) create mode 100644 app/backend/prepdocslib/cloud_vs_local.txt diff --git a/AGENTS.md b/AGENTS.md index a4c85529cb..7ad6d26b84 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -116,3 +116,7 @@ cd scripts && mypy . --config-file=../pyproject.toml Note that we do not currently enforce type hints in the tests folder, as it would require adding a lot of `# type: ignore` comments to the existing tests. 
We only enforce type hints in the main application code and scripts. + +## Python code style + +Do not use single underscores in front of "private" methods or variables in Python code. We do not follow that convention in this codebase, since this is an application and not a library. diff --git a/app/backend/prepdocslib/blobmanager.py b/app/backend/prepdocslib/blobmanager.py index fe8d2d56fb..a4135baa47 100644 --- a/app/backend/prepdocslib/blobmanager.py +++ b/app/backend/prepdocslib/blobmanager.py @@ -450,7 +450,7 @@ async def upload_document_image( raise ValueError( "user_oid is not supported for BlobManager. Use AdlsBlobManager for user-specific operations." ) - container_client = self.blob_service_client.get_container_client(self.container) + container_client = self.blob_service_client.get_container_client(self.image_container) if not await container_client.exists(): await container_client.create_container() image_bytes = self.add_image_citation(image_bytes, document_filename, image_filename, image_page_num) diff --git a/app/backend/prepdocslib/cloud_vs_local.txt b/app/backend/prepdocslib/cloud_vs_local.txt new file mode 100644 index 0000000000..bd1d2a67ec --- /dev/null +++ b/app/backend/prepdocslib/cloud_vs_local.txt @@ -0,0 +1,53 @@ +search query: + +{ + "search": "*", + "count": true, + "filter": "sourcepage eq 'Northwind_Health_Plus_Benefits_Details.pdf#page=1'", + "facets": ["sourcepage"] +} + +incorrect = { + "@search.score": 0.5914636, + "id": "7f5de0fafb88_aHR0cHM6Ly9zdGQ0Z2ZiYWpuM2UzeXUuYmxvYi5jb3JlLndpbmRvd3MubmV0L2NvbnRlbnQvTm9ydGh3aW5kX0hlYWx0aF9QbHVzX0JlbmVmaXRzX0RldGFpbHMucGRm0_chunks_0", + "content": "# Contoso Electronics\n\nNorthwind Health Plus Plan\n\n\n
", + "category": null, + "sourcepage": "Northwind_Health_Plus_Benefits_Details.pdf#page=1", + "sourcefile": "Northwind_Health_Plus_Benefits_Details.pdf", + "storageUrl": "https://std4gfbajn3e3yu.blob.core.windows.net/content/Northwind_Health_Plus_Benefits_Details.pdf", + "parent_id": "aHR0cHM6Ly9zdGQ0Z2ZiYWpuM2UzeXUuYmxvYi5jb3JlLndpbmRvd3MubmV0L2NvbnRlbnQvTm9ydGh3aW5kX0hlYWx0aF9QbHVzX0JlbmVmaXRzX0RldGFpbHMucGRm0", + "images": [ + { + "url": "", + "description": "", + "boundingbox": [ + 373.97, + 620.07, + 575.79, + 701.93 + ] + } + ] + } + +correct = { + "@search.score": 1, + "id": "file-Northwind_Health_Plus_Benefits_Details_pdf-4E6F72746877696E645F4865616C74685F506C75735F42656E65666974735F44657461696C732E706466-page-0", + "content": "# Contoso Electronics\n\nNorthwind Health Plus Plan\n\n\nThe image shows a logo consisting of a graphic and text. The graphic is a stylized blue drone with four propellers depicted by four curving lines at the ends of its arms. To the right of the drone graphic, there is the text \"Contoso Electronics\" written in black. The text is aligned vertically with \"Contoso\" on top and \"Electronics\" below it. The overall design is simple and clean.", + "category": null, + "sourcepage": "Northwind_Health_Plus_Benefits_Details.pdf#page=1", + "sourcefile": "Northwind_Health_Plus_Benefits_Details.pdf", + "storageUrl": "https://std4gfbajn3e3yu.blob.core.windows.net/content/Northwind_Health_Plus_Benefits_Details.pdf", + "images": [ + { + "url": "https://std4gfbajn3e3yu.blob.core.windows.net/images/Northwind_Health_Plus_Benefits_Details.pdf/page0/figure1_1.png", + "description": "The image shows a logo consisting of a graphic and text. The graphic is a stylized blue drone with four propellers depicted by four curving lines at the ends of its arms. To the right of the drone graphic, there is the text \"Contoso Electronics\" written in black. The text is aligned vertically with \"Contoso\" on top and \"Electronics\" below it. 
The overall design is simple and clean.", + "boundingbox": [ + 373.97, + 620.07, + 575.79, + 701.93 + ] + } + ] + } diff --git a/app/backend/prepdocslib/cloudingestionstrategy.py b/app/backend/prepdocslib/cloudingestionstrategy.py index d313b7905a..fd9b5ce25f 100644 --- a/app/backend/prepdocslib/cloudingestionstrategy.py +++ b/app/backend/prepdocslib/cloudingestionstrategy.py @@ -1,7 +1,5 @@ """Cloud ingestion strategy using Azure AI Search custom skills.""" -from __future__ import annotations - import logging from dataclasses import dataclass from datetime import timedelta @@ -23,6 +21,7 @@ SearchIndexerIndexProjectionSelector, SearchIndexerIndexProjectionsParameters, SearchIndexerSkillset, + ShaperSkill, WebApiSkill, ) @@ -34,12 +33,12 @@ logger = logging.getLogger("scripts") -_DEFAULT_TIMEOUT = timedelta(seconds=230) -_DEFAULT_BATCH_SIZE = 1 +DEFAULT_SKILL_TIMEOUT = timedelta(seconds=230) +DEFAULT_BATCH_SIZE = 1 @dataclass(slots=True) -class _SkillConfig: +class SkillConfig: """Configuration for a custom Web API skill.""" name: str @@ -72,21 +71,6 @@ def __init__( use_multimodal: bool = False, enforce_access_control: bool = False, ) -> None: - if not search_field_name_embedding: - raise ValueError("search_field_name_embedding must be provided for cloud ingestion") - if not document_extractor_uri: - raise ValueError("document_extractor_uri must be provided for cloud ingestion") - if not document_extractor_auth_resource_id: - raise ValueError("document_extractor_auth_resource_id must be provided for cloud ingestion") - if not figure_processor_uri: - raise ValueError("figure_processor_uri must be provided for cloud ingestion") - if not figure_processor_auth_resource_id: - raise ValueError("figure_processor_auth_resource_id must be provided for cloud ingestion") - if not text_processor_uri: - raise ValueError("text_processor_uri must be provided for cloud ingestion") - if not text_processor_auth_resource_id: - raise ValueError("text_processor_auth_resource_id must 
be provided for cloud ingestion") - self.list_file_strategy = list_file_strategy self.blob_manager = blob_manager self.document_action = document_action @@ -104,19 +88,19 @@ def __init__( self.indexer_name = f"{prefix}-indexer" self.data_source_name = f"{prefix}-blob" - self.document_extractor = _SkillConfig( + self.document_extractor = SkillConfig( name=f"{prefix}-document-extractor-skill", description="Custom skill that downloads and parses source documents", uri=document_extractor_uri, auth_resource_id=document_extractor_auth_resource_id, ) - self.figure_processor = _SkillConfig( + self.figure_processor = SkillConfig( name=f"{prefix}-figure-processor-skill", description="Custom skill that enriches individual figures", uri=figure_processor_uri, auth_resource_id=figure_processor_auth_resource_id, ) - self.text_processor = _SkillConfig( + self.text_processor = SkillConfig( name=f"{prefix}-text-processor-skill", description="Custom skill that merges figures, chunks text, and generates embeddings", uri=text_processor_uri, @@ -125,35 +109,46 @@ def __init__( self._search_manager: SearchManager | None = None - def _build_search_manager(self) -> SearchManager: - if not isinstance(self.embeddings, AzureOpenAIEmbeddingService): - raise TypeError("Cloud ingestion requires AzureOpenAIEmbeddingService for search index setup") - - return SearchManager( - search_info=self.search_info, - search_analyzer_name=self.search_analyzer_name, - use_acls=self.use_acls, - use_parent_index_projection=True, - embeddings=self.embeddings, - field_name_embedding=self.search_field_name_embedding, - search_images=self.use_multimodal, - enforce_access_control=self.enforce_access_control, - ) + def _build_skillset(self) -> SearchIndexerSkillset: + prefix = f"{self.search_info.index_name}-cloud" - def _build_document_extractor_skill(self) -> WebApiSkill: - outputs = [ - OutputFieldMappingEntry(name="pages", target_name="pages"), - OutputFieldMappingEntry(name="figures", target_name="figures"), + # 
NOTE: Do NOT map the chunk id directly to the index key field. Azure AI Search + # index projections forbid mapping an input field onto the target index key when + # using parent/child projections. The service will generate keys for projected + # child documents automatically. Removing the explicit 'id' mapping resolves + # HttpResponseError: "Input 'id' cannot map to the key field". + mappings = [ + InputFieldMappingEntry(name="content", source="/document/chunks/*/content"), + InputFieldMappingEntry(name="sourcepage", source="/document/chunks/*/sourcepage"), + InputFieldMappingEntry(name="sourcefile", source="/document/chunks/*/sourcefile"), + InputFieldMappingEntry(name=self.search_field_name_embedding, source="/document/chunks/*/embedding"), + InputFieldMappingEntry(name="storageUrl", source="/document/metadata_storage_path"), ] + if self.use_multimodal: + mappings.append(InputFieldMappingEntry(name="images", source="/document/chunks/*/images")) - return WebApiSkill( + index_projection = SearchIndexerIndexProjection( + selectors=[ + SearchIndexerIndexProjectionSelector( + target_index_name=self.search_info.index_name, + parent_key_field_name="parent_id", + source_context="/document/chunks/*", + mappings=mappings, + ) + ], + parameters=SearchIndexerIndexProjectionsParameters( + projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS, + ), + ) + + document_extractor_skill = WebApiSkill( name=self.document_extractor.name, description=self.document_extractor.description, context="/document", uri=self.document_extractor.uri, http_method="POST", - timeout=_DEFAULT_TIMEOUT, - batch_size=_DEFAULT_BATCH_SIZE, + timeout=DEFAULT_SKILL_TIMEOUT, + batch_size=DEFAULT_BATCH_SIZE, degree_of_parallelism=1, # Managed identity: Search service authenticates against the function app using this resource ID. 
auth_resource_id=self.document_extractor.auth_resource_id, @@ -163,106 +158,114 @@ def _build_document_extractor_skill(self) -> WebApiSkill: InputFieldMappingEntry(name="file_name", source="/document/metadata_storage_name"), InputFieldMappingEntry(name="content_type", source="/document/metadata_storage_content_type"), ], - outputs=outputs, + outputs=[ + OutputFieldMappingEntry(name="pages", target_name="pages"), + OutputFieldMappingEntry(name="figures", target_name="figures"), + ], ) - def _build_figure_processor_skill(self) -> WebApiSkill: - inputs = [ - InputFieldMappingEntry(name="figure_id", source="/document/figures/*/figure_id"), - InputFieldMappingEntry(name="document_file_name", source="/document/figures/*/document_file_name"), - InputFieldMappingEntry(name="filename", source="/document/figures/*/filename"), - InputFieldMappingEntry(name="mime_type", source="/document/figures/*/mime_type"), - InputFieldMappingEntry(name="bytes_base64", source="/document/figures/*/bytes_base64"), - InputFieldMappingEntry(name="page_num", source="/document/figures/*/page_num"), - InputFieldMappingEntry(name="bbox", source="/document/figures/*/bbox"), - ] - outputs = [ - OutputFieldMappingEntry(name="caption", target_name="caption"), - OutputFieldMappingEntry(name="url", target_name="url"), - ] - if self.use_multimodal: - outputs.append(OutputFieldMappingEntry(name="imageEmbedding", target_name="imageEmbedding")) - - return WebApiSkill( + figure_processor_skill = WebApiSkill( name=self.figure_processor.name, description=self.figure_processor.description, context="/document/figures/*", uri=self.figure_processor.uri, http_method="POST", - timeout=_DEFAULT_TIMEOUT, - batch_size=_DEFAULT_BATCH_SIZE, + timeout=DEFAULT_SKILL_TIMEOUT, + batch_size=DEFAULT_BATCH_SIZE, degree_of_parallelism=1, # Managed identity: Search service authenticates against the function app using this resource ID. 
auth_resource_id=self.figure_processor.auth_resource_id, - inputs=inputs, - outputs=outputs, + inputs=[ + InputFieldMappingEntry(name="figure_id", source="/document/figures/*/figure_id"), + InputFieldMappingEntry(name="document_file_name", source="/document/figures/*/document_file_name"), + InputFieldMappingEntry(name="filename", source="/document/figures/*/filename"), + InputFieldMappingEntry(name="mime_type", source="/document/figures/*/mime_type"), + InputFieldMappingEntry(name="bytes_base64", source="/document/figures/*/bytes_base64"), + InputFieldMappingEntry(name="page_num", source="/document/figures/*/page_num"), + InputFieldMappingEntry(name="bbox", source="/document/figures/*/bbox"), + InputFieldMappingEntry(name="placeholder", source="/document/figures/*/placeholder"), + InputFieldMappingEntry(name="title", source="/document/figures/*/title"), + ], + outputs=[ + # Only output the enriched fields to avoid cyclic dependency + OutputFieldMappingEntry(name="description", target_name="description"), + OutputFieldMappingEntry(name="url", target_name="url"), + OutputFieldMappingEntry(name="embedding", target_name="embedding"), + ], ) - def _build_text_processor_skill(self) -> WebApiSkill: - inputs = [ - InputFieldMappingEntry(name="pages", source="/document/pages"), - InputFieldMappingEntry(name="figures", source="/document/figures"), - InputFieldMappingEntry(name="file_name", source="/document/metadata_storage_name"), - InputFieldMappingEntry(name="storageUrl", source="/document/metadata_storage_path"), - ] + # Shaper skill to consolidate pages and enriched figures into a single object + shaper_skill = ShaperSkill( + name=f"{prefix}-document-shaper-skill", + description="Consolidates pages and enriched figures into a single document object", + context="/document", + inputs=[ + InputFieldMappingEntry(name="pages", source="/document/pages"), + InputFieldMappingEntry( + name="figures", + source_context="/document/figures/*", + inputs=[ + 
InputFieldMappingEntry(name="figure_id", source="/document/figures/*/figure_id"), + InputFieldMappingEntry( + name="document_file_name", source="/document/figures/*/document_file_name" + ), + InputFieldMappingEntry(name="filename", source="/document/figures/*/filename"), + InputFieldMappingEntry(name="mime_type", source="/document/figures/*/mime_type"), + InputFieldMappingEntry(name="page_num", source="/document/figures/*/page_num"), + InputFieldMappingEntry(name="bbox", source="/document/figures/*/bbox"), + InputFieldMappingEntry(name="placeholder", source="/document/figures/*/placeholder"), + InputFieldMappingEntry(name="title", source="/document/figures/*/title"), + InputFieldMappingEntry(name="description", source="/document/figures/*/description"), + InputFieldMappingEntry(name="url", source="/document/figures/*/url"), + InputFieldMappingEntry(name="embedding", source="/document/figures/*/embedding"), + ], + ), + InputFieldMappingEntry(name="file_name", source="/document/metadata_storage_name"), + InputFieldMappingEntry(name="storageUrl", source="/document/metadata_storage_path"), + ], + outputs=[OutputFieldMappingEntry(name="output", target_name="consolidated_document")], + ) - return WebApiSkill( + text_processor_skill = WebApiSkill( name=self.text_processor.name, description=self.text_processor.description, context="/document", uri=self.text_processor.uri, http_method="POST", - timeout=_DEFAULT_TIMEOUT, - batch_size=_DEFAULT_BATCH_SIZE, + timeout=DEFAULT_SKILL_TIMEOUT, + batch_size=DEFAULT_BATCH_SIZE, degree_of_parallelism=1, # Managed identity: Search service authenticates against the function app using this resource ID. auth_resource_id=self.text_processor.auth_resource_id, - inputs=inputs, - outputs=[OutputFieldMappingEntry(name="chunks", target_name="chunks")], - ) - - def _build_skillset(self) -> SearchIndexerSkillset: - # NOTE: Do NOT map the chunk id directly to the index key field. 
Azure AI Search - # index projections forbid mapping an input field onto the target index key when - # using parent/child projections. The service will generate keys for projected - # child documents automatically. Removing the explicit 'id' mapping resolves - # HttpResponseError: "Input 'id' cannot map to the key field". - mappings = [ - InputFieldMappingEntry(name="content", source="/document/chunks/*/content"), - InputFieldMappingEntry(name="sourcepage", source="/document/chunks/*/sourcepage"), - InputFieldMappingEntry(name="sourcefile", source="/document/chunks/*/sourcefile"), - InputFieldMappingEntry(name=self.search_field_name_embedding, source="/document/chunks/*/embedding"), - InputFieldMappingEntry(name="storageUrl", source="/document/metadata_storage_path"), - ] - - index_projection = SearchIndexerIndexProjection( - selectors=[ - SearchIndexerIndexProjectionSelector( - target_index_name=self.search_info.index_name, - parent_key_field_name="parent_id", - source_context="/document/chunks/*", - mappings=mappings, - ) + inputs=[ + InputFieldMappingEntry(name="consolidated_document", source="/document/consolidated_document"), ], - parameters=SearchIndexerIndexProjectionsParameters( - projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS, - ), + outputs=[OutputFieldMappingEntry(name="chunks", target_name="chunks")], ) return SearchIndexerSkillset( name=self.skillset_name, description="Skillset linking document extraction, figure enrichment, and text processing functions", - skills=[ - self._build_document_extractor_skill(), - self._build_figure_processor_skill(), - self._build_text_processor_skill(), - ], + skills=[document_extractor_skill, figure_processor_skill, shaper_skill, text_processor_skill], index_projection=index_projection, ) async def setup(self) -> None: logger.info("Setting up search index and skillset for cloud ingestion") - self._search_manager = self._build_search_manager() + + if not isinstance(self.embeddings, 
AzureOpenAIEmbeddingService): + raise TypeError("Cloud ingestion requires AzureOpenAIEmbeddingService for search index setup") + self._search_manager = SearchManager( + search_info=self.search_info, + search_analyzer_name=self.search_analyzer_name, + use_acls=self.use_acls, + use_parent_index_projection=True, + embeddings=self.embeddings, + field_name_embedding=self.search_field_name_embedding, + search_images=self.use_multimodal, + enforce_access_control=self.enforce_access_control, + ) + await self._search_manager.create_index() async with self.search_info.create_search_indexer_client() as indexer_client: @@ -303,9 +306,7 @@ async def run(self) -> None: parameters=IndexingParameters( configuration=IndexingParametersConfiguration( query_timeout=None, - # markdown_parsing_submode=None, data_to_extract="storageMetadata", - # markdown_header_depth=None, allow_skillset_to_read_file_data=True, ) ), diff --git a/app/backend/prepdocslib/filestrategy.py b/app/backend/prepdocslib/filestrategy.py index f996182651..3eb8594f99 100644 --- a/app/backend/prepdocslib/filestrategy.py +++ b/app/backend/prepdocslib/filestrategy.py @@ -118,7 +118,7 @@ async def run(self): files = self.list_file_strategy.list() async for file in files: try: - await self.blob_manager.upload_blob(file) + blob_url = await self.blob_manager.upload_blob(file) sections = await parse_file( file, self.file_processors, @@ -128,7 +128,7 @@ async def run(self): figure_processor=self.figure_processor, ) if sections: - await self.search_manager.update_content(sections, url=file.url) + await self.search_manager.update_content(sections, url=blob_url) finally: if file: file.close() diff --git a/app/backend/prepdocslib/page.py b/app/backend/prepdocslib/page.py index 3cfaba6819..0015df05ad 100644 --- a/app/backend/prepdocslib/page.py +++ b/app/backend/prepdocslib/page.py @@ -52,20 +52,25 @@ def to_skill_payload( b = self.bytes if isinstance(self.bytes, (bytes, bytearray)) else b"" data["bytes_base64"] = 
base64.b64encode(b).decode("utf-8") + # Remove None values to prevent document extractor from emitting fields that will be + # enriched by figure processor, avoiding potential conflicts in Azure AI Search enrichment merge + data = {k: v for k, v in data.items() if v is not None} + data["document_file_name"] = file_name return data @classmethod def from_skill_payload(cls, data: dict[str, Any]) -> tuple["ImageOnPage", str]: """Deserialize a figure skill payload into an ImageOnPage, normalizing fields.""" - # Decode base64 image data + # Decode base64 image data (optional - may be omitted if already persisted to blob) bytes_base64 = data.get("bytes_base64") - if not bytes_base64: - raise ValueError("Figure payload missing required bytes_base64 field") - try: - raw_bytes = base64.b64decode(bytes_base64) - except Exception as exc: # pragma: no cover - defensive - raise ValueError("Invalid bytes_base64 image data") from exc + if bytes_base64: + try: + raw_bytes = base64.b64decode(bytes_base64) + except Exception as exc: # pragma: no cover - defensive + raise ValueError("Invalid bytes_base64 image data") from exc + else: + raw_bytes = b"" # Empty bytes if not provided (already uploaded to blob) # page_num may arrive as str; coerce try: @@ -89,6 +94,8 @@ def from_skill_payload(cls, data: dict[str, Any]) -> tuple["ImageOnPage", str]: placeholder=data.get("placeholder"), mime_type=data.get("mime_type") or "image/png", title=data.get("title"), + description=data.get("description"), + url=data.get("url"), ) return image, data.get("document_file_name", "") diff --git a/app/backend/prepdocslib/textprocessor.py b/app/backend/prepdocslib/textprocessor.py index 595951a0fb..d7d1d64aad 100644 --- a/app/backend/prepdocslib/textprocessor.py +++ b/app/backend/prepdocslib/textprocessor.py @@ -19,10 +19,12 @@ def combine_text_with_figures(page: "Page") -> None: After figures have been described and enriched, this replaces their placeholders in the page text with the full
markup. """ + from .figureprocessor import build_figure_markup for image in page.images: if image.description and image.placeholder in page.text: - page.text = page.text.replace(image.placeholder, image.description) + figure_markup = build_figure_markup(image, image.description) + page.text = page.text.replace(image.placeholder, figure_markup) logger.info("Replaced placeholder for figure %s with description markup", image.figure_id) elif not image.description: logger.debug("No description for figure %s; keeping placeholder", image.figure_id) diff --git a/app/functions/figure_processor/function_app.py b/app/functions/figure_processor/function_app.py index aa82a8b2b2..e4074dace5 100644 --- a/app/functions/figure_processor/function_app.py +++ b/app/functions/figure_processor/function_app.py @@ -44,7 +44,7 @@ CONTENT_UNDERSTANDING_ENDPOINT = os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT", "") AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE", "") AZURE_OPENAI_CUSTOM_URL = os.getenv("AZURE_OPENAI_CUSTOM_URL", "") -AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "") +AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-06-01") AZURE_OPENAI_CHATGPT_DEPLOYMENT = os.getenv("AZURE_OPENAI_CHATGPT_DEPLOYMENT", "") AZURE_OPENAI_CHATGPT_MODEL = os.getenv("AZURE_OPENAI_CHATGPT_MODEL", "") AZURE_VISION_ENDPOINT = os.getenv("AZURE_VISION_ENDPOINT", "") @@ -141,6 +141,12 @@ async def process_figure_request(req: func.HttpRequest) -> func.HttpResponse: data = record.get("data", {}) try: image_on_page, file_name = ImageOnPage.from_skill_payload(data) + logger.info( + "Figure processor input for %s: url=%s, description=%s", + image_on_page.figure_id, + image_on_page.url, + image_on_page.description, + ) await process_page_image( image=image_on_page, document_filename=file_name, @@ -148,7 +154,19 @@ async def process_figure_request(req: func.HttpRequest) -> func.HttpResponse: image_embeddings_client=IMAGE_EMBEDDINGS, 
figure_processor=FIGURE_PROCESSOR, ) + logger.info( + "Figure processor after enrichment for %s: url=%s, description=%s", + image_on_page.figure_id, + (image_on_page.url or "NONE")[:100], + (image_on_page.description or "NONE")[:100], + ) figure_payload = image_on_page.to_skill_payload(file_name, include_bytes_base64=False, include_bytes=False) + logger.info( + "Figure processor returning payload for %s: url='%s', description='%s'", + image_on_page.figure_id, + figure_payload.get("url", "MISSING")[:100] if figure_payload.get("url") else "NONE", + figure_payload.get("description", "MISSING")[:100] if figure_payload.get("description") else "NONE", + ) output_values.append( { "recordId": record_id, diff --git a/app/functions/text_processor/function_app.py b/app/functions/text_processor/function_app.py index 0eff4c5f76..25f2c15389 100644 --- a/app/functions/text_processor/function_app.py +++ b/app/functions/text_processor/function_app.py @@ -139,7 +139,7 @@ async def _process_document(data: dict[str, Any]) -> list[dict[str, Any]]: Parameters ---------- data: dict[str, Any] - Skill payload containing file metadata, pages, and figures. + Skill payload containing consolidated_document with file metadata, pages, and figures. Returns ------- @@ -147,12 +147,39 @@ async def _process_document(data: dict[str, Any]) -> list[dict[str, Any]]: Chunk dictionaries ready for downstream indexing. 
""" - file_name = data.get("file_name", "document") - storage_url = data.get("storageUrl") or data.get("metadata_storage_path") or file_name - pages_input = data.get("pages", []) # [{page_num, text, figure_ids}] - figures_input = data.get("figures", []) # serialized skill payload + # Extract consolidated_document object from Shaper skill + consolidated_doc = data.get("consolidated_document", data) + + file_name = consolidated_doc.get("file_name", "document") + storage_url = consolidated_doc.get("storageUrl") or consolidated_doc.get("metadata_storage_path") or file_name + pages_input = consolidated_doc.get("pages", []) # [{page_num, text, figure_ids}] + figures_input = consolidated_doc.get("figures", []) # serialized skill payload + + # Merge enriched fields from figure processor into figures array + enriched_descriptions = data.get("enriched_descriptions", []) + enriched_urls = data.get("enriched_urls", []) + enriched_embeddings = data.get("enriched_embeddings", []) + + for i, figure in enumerate(figures_input): + if i < len(enriched_descriptions): + figure["description"] = enriched_descriptions[i] + if i < len(enriched_urls): + figure["url"] = enriched_urls[i] + if i < len(enriched_embeddings): + figure["embedding"] = enriched_embeddings[i] + + # Debug: log the first figure to see what fields are present + if figures_input: + logger.info("DEBUG: First figure keys after merge: %s", list(figures_input[0].keys())) + logger.info( + "DEBUG: First figure sample after merge: %s", + {k: str(v)[:50] if v else v for k, v in list(figures_input[0].items())[:10]}, + ) + figures_by_id = {figure["figure_id"]: figure for figure in figures_input} + logger.info("Processing %s: %d pages, %d figures", file_name, len(pages_input), len(figures_input)) + # Build Page objects with placeholders intact (figure markup will be injected by combine_text_with_figures()) pages: list[Page] = [] offset = 0 @@ -170,8 +197,31 @@ async def _process_document(data: dict[str, Any]) -> list[dict[str, 
Any]]: if not figure_payload: logger.warning("Figure ID %s not found in figures metadata for page %d", fid, page_num) continue - image_on_page, _ = ImageOnPage.from_skill_payload(figure_payload) - page_obj.images.append(image_on_page) + logger.info( + "Deserializing figure %s: has description=%s, has url=%s, has bytes_base64=%s", + fid, + "description" in figure_payload, + "url" in figure_payload, + "bytes_base64" in figure_payload, + ) + logger.info( + "Figure %s payload values: description='%s', url='%s'", + fid, + figure_payload.get("description", "MISSING")[:100] if figure_payload.get("description") else "NONE", + figure_payload.get("url", "MISSING")[:100] if figure_payload.get("url") else "NONE", + ) + try: + image_on_page, _ = ImageOnPage.from_skill_payload(figure_payload) + logger.info( + "Figure %s deserialized: description='%s', url='%s', placeholder=%s", + fid, + (image_on_page.description or "NONE")[:100], + image_on_page.url or "NONE", + image_on_page.placeholder, + ) + page_obj.images.append(image_on_page) + except Exception as exc: + logger.error("Failed to deserialize figure %s: %s", fid, exc, exc_info=True) pages.append(page_obj) if not pages: @@ -207,15 +257,12 @@ async def _process_document(data: dict[str, Any]) -> list[dict[str, Any]]: image_refs: list[dict[str, Any]] = [] for image in section.chunk.images: ref = { - "id": image.figure_id, "url": image.url or "", - "caption": image.title or image.figure_id, - "bbox": list(image.bbox), + "description": image.description or "", + "boundingbox": list(image.bbox), } - # Optionally surface plain description separately (strip markup) if needed later. - # Since image.description now holds markup, we do not include it here by default. 
if USE_MULTIMODAL and image.embedding is not None: - ref["imageEmbedding"] = image.embedding + ref["embedding"] = image.embedding image_refs.append(ref) chunk_entry: dict[str, Any] = { "id": f"{normalized_id}-{idx:04d}", diff --git a/docs/cloud_ingestion.md b/docs/cloud_ingestion.md index 35720b4e80..506aa444a7 100644 --- a/docs/cloud_ingestion.md +++ b/docs/cloud_ingestion.md @@ -23,7 +23,7 @@ This architecture enables serverless, scalable, and event-driven document proces ┌─────────────────────────────────────────────────────────────────┐ │ Azure AI Search Indexer │ │ - Blob data source (monitors content container) │ -│ - Skillset with 3 chained custom skills │ +│ - Skillset with 4 chained skills (3 custom + 1 built-in) │ │ - Runs on schedule or on-demand │ │ - Handles retries, checkpointing, state tracking │ └──────────────────────┬──────────────────────────────────────────┘ @@ -33,13 +33,14 @@ This architecture enables serverless, scalable, and event-driven document proces │ SKILL #1: document_extractor│ │ (Flex Consumption Function) │ │ HTTP Trigger │ + │ Context: /document │ │ Timeout: 10 minutes │ └─────────────┬────────────────┘ │ - Input: │ Output: - • Blob URL │ • Markdown text with figure anchors - • File metadata │ • Page metadata (text + figure ids) - │ • Figures array (metadata + base64 image) + Input: │ Output to /document: + • Blob URL │ • pages[] (text + figure ids) + • File metadata │ • figures[] (metadata + base64 image) + │ Processes: │ • Download blob │ • Document Intelligence @@ -51,15 +52,16 @@ This architecture enables serverless, scalable, and event-driven document proces │ SKILL #2: figure_processor │ │ (Flex Consumption Function)│ │ HTTP Trigger │ + │ Context: /document/figures/*│ │ Timeout: 6 minutes │ │ Memory: 3072 MB │ └─────────────┬───────────────┘ │ - Context: │ Output: - • /document/figures/*│ • Figure url (blob SAS) - Input values: │ • Figure caption - • Figure bytes │ • Figure embedding vector - • Figure metadata │ + Input (per 
figure): │ Output to /document/figures/*: + • Figure bytes │ • description (enriched) + • Figure metadata │ • url (enriched) + • placeholder │ • embedding (enriched) + • title │ Processes: │ • Upload to blob │ • Describe via LLM │ @@ -67,19 +69,43 @@ This architecture enables serverless, scalable, and event-driven document proces │ ▼ ┌─────────────────────────────┐ - │ SKILL #3: text_processor │ + │ SKILL #3: Shaper Skill │ + │ (Built-in Azure AI Search) │ + │ Context: /document │ + └─────────────┬───────────────┘ + │ + Purpose: │ Output to /document: + • Consolidate data │ • consolidated_document: + │ - pages[] (from skill #1) + Shaper combines: │ - figures[] (enriched from skill #2) + • Original pages │ - file_name + • Enriched figures │ - storageUrl + • File metadata │ + │ + Why needed: │ + Azure AI Search enrichment tree isolates contexts. + Data enriched at /document/figures/* doesn't automatically + merge into /document scope. Shaper explicitly consolidates + all fields into a single object for downstream consumption. + │ + ▼ + ┌─────────────────────────────┐ + │ SKILL #4: text_processor │ │ (Combines, splits, embeds) │ │ HTTP Trigger │ + │ Context: /document │ │ Timeout: 5 minutes │ │ Memory: 2048 MB │ └─────────────┬───────────────┘ │ Input: │ Output: - • Full markdown │ • Array of chunks with: - • Processed figures │ - Content text - • File metadata │ - Text embeddings - Processes: │ - Figure references + embeddings - • Enrich placeholders│ - Metadata (sourcepage, etc.) + • consolidated_doc │ • Array of chunks with: + - pages[] │ - Content text + - figures[] │ - Text embeddings + - file_name │ - Figure references + embeddings + - storageUrl │ - Metadata (sourcepage, etc.) 
+ Processes: │ + • Enrich placeholders│ • Split text │ • Generate embeddings│ │ @@ -214,9 +240,8 @@ This architecture enables serverless, scalable, and event-driven document proces { "recordId": "1", "data": { - "id": "fig1", "url": "https://storage.../images/doc-fig1.png", - "caption": "Bar chart showing quarterly revenue", + "description": "
Bar chart showing quarterly revenue
", "imageEmbedding": [0.789, -0.012, ...] }, "errors": [], @@ -226,7 +251,85 @@ This architecture enables serverless, scalable, and event-driven document proces } ``` -### 3. Text Processor Function +### 4. Shaper Skill (Built-in) + +**Type:** Built-in Azure AI Search skill + +**Purpose:** Consolidates enrichments from different contexts into a single object. + +**Why Needed:** + +Azure AI Search's enrichment tree isolates data by context. When the `figure_processor` skill runs at context `/document/figures/*`, it enriches individual figure objects (adding `description`, `url`, `embedding`). However, these enrichments remain isolated in the `/document/figures/*` context and don't automatically merge into the `/document` context where the `text_processor` skill operates. + +The Shaper skill explicitly consolidates: + +- Original `pages` array from `document_extractor` +- Enriched `figures` array with `description`, `url`, `embedding` from `figure_processor` +- File metadata (`file_name`, `storageUrl`) + +This consolidated object is then passed to the `text_processor` skill, ensuring it receives all enriched data in a single, well-structured input. + +**Configuration:** + +- Context: `/document` +- Uses nested `inputs` syntax with `source_context` for array consolidation +- Output: `consolidated_document` object containing all required fields + +**Input Mapping:** + +```python +ShaperSkill( + name="document-shaper-skill", + context="/document", + inputs=[ + InputFieldMappingEntry(name="pages", source="/document/pages"), + InputFieldMappingEntry( + name="figures", + source_context="/document/figures/*", + inputs=[ + InputFieldMappingEntry(name="figure_id", source="/document/figures/*/figure_id"), + InputFieldMappingEntry(name="filename", source="/document/figures/*/filename"), + # ... 
other figure fields + InputFieldMappingEntry(name="description", source="/document/figures/*/description"), + InputFieldMappingEntry(name="url", source="/document/figures/*/url"), + InputFieldMappingEntry(name="embedding", source="/document/figures/*/embedding"), + ] + ), + InputFieldMappingEntry(name="file_name", source="/document/metadata_storage_name"), + InputFieldMappingEntry(name="storageUrl", source="/document/metadata_storage_path"), + ], + outputs=[ + OutputFieldMappingEntry(name="output", target_name="consolidated_document") + ] +) +``` + +**Output Format:** + +The Shaper skill produces a `consolidated_document` object at `/document/consolidated_document`: + +```json +{ + "consolidated_document": { + "pages": [ + {"page_num": 0, "text": "...", "figure_ids": ["1.1"]} + ], + "figures": [ + { + "figure_id": "1.1", + "filename": "figure1_1.png", + "description": "The image shows a logo...", + "url": "https://storage.../images/doc/figure1_1.png", + "embedding": [0.123, -0.456, ...] + } + ], + "file_name": "document.pdf", + "storageUrl": "https://storage.../content/document.pdf" + } +} +``` + +### 5. 
Text Processor Function **Location:** `app/functions/text_processor/` From 7bec324bd35b2be171aa449afa99b28b93890b82 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Mon, 10 Nov 2025 11:41:43 -0800 Subject: [PATCH 15/30] Working more on the docs --- .github/prompts/testcov.prompt.md | 27 ++ AGENTS.md | 30 ++ README.md | 2 +- app/backend/prepdocslib/cloud_vs_local.txt | 53 --- app/functions/__init__.py | 0 app/functions/text_processor/function_app.py | 1 + azure.yaml | 42 +- docs/cloud_ingestion.md | 7 +- docs/data_ingestion.md | 102 +++-- docs/deploy_features.md | 27 +- pyproject.toml | 4 +- scripts/compare_search_indexes.py | 298 ++++++++++++++ tests/test_function_apps.py | 396 +++++++++++++++++++ 13 files changed, 865 insertions(+), 124 deletions(-) create mode 100644 .github/prompts/testcov.prompt.md delete mode 100644 app/backend/prepdocslib/cloud_vs_local.txt create mode 100644 app/functions/__init__.py create mode 100644 scripts/compare_search_indexes.py create mode 100644 tests/test_function_apps.py diff --git a/.github/prompts/testcov.prompt.md b/.github/prompts/testcov.prompt.md new file mode 100644 index 0000000000..76a318deb9 --- /dev/null +++ b/.github/prompts/testcov.prompt.md @@ -0,0 +1,27 @@ +--- +agent: agent +--- + +The goal is for the tests to cover all lines of code. + +Generate a coverage report with: + +pytest --cov --cov-report=annotate:cov_annotate + +If you are checking for coverage of a specific module, you can specify it like this: + +pytest --cov=your_module_name --cov-report=annotate:cov_annotate + +You can also specify specific tests to run, for example: + +pytest tests/test_your_module.py --cov=your_module_name --cov-report=annotate:cov_annotate + +Open the cov_annotate directory to view the annotated source code. +There will be one file per source file. If a file has 100% source coverage, it means all lines are covered by tests, so you do not need to open the file. 
+ +For each file that has less than 100% test coverage, find the matching file in cov_annotate and review the file. + +If a line starts with a ! (exclamation mark), it means that the line is not covered by tests. +Add tests to cover the missing lines. + +Keep running the tests and improving coverage until all lines are covered. diff --git a/AGENTS.md b/AGENTS.md index 7ad6d26b84..9841965676 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -120,3 +120,33 @@ We only enforce type hints in the main application code and scripts. ## Python code style Do not use single underscores in front of "private" methods or variables in Python code. We do not follow that convention in this codebase, since this is an application and not a library. + +## Deploying the application + +To deploy the application, use the `azd` CLI tool. Make sure you have the latest version of the `azd` CLI installed. Then, run the following command from the root of the repository: + +```shell +azd up +``` + +That command will BOTH provision the Azure resources AND deploy the application code. + +If you only changed the Bicep templates and want to re-provision the Azure resources, run: + +```shell +azd provision +``` + +If you only changed the application code and want to re-deploy the code, run: + +```shell +azd deploy +``` + +If you are using cloud ingestion and only want to deploy individual functions, run the necessary deploy commands, for example: + +```shell +azd deploy document-extractor +azd deploy figure-processor +azd deploy text-processor +``` diff --git a/README.md b/README.md index f53d895551..f2f6ec8dd1 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ The repo includes sample data so it's ready to try end to end. 
In this sample ap - Chat (multi-turn) and Q&A (single turn) interfaces - Renders citations and thought process for each answer - Includes settings directly in the UI to tweak the behavior and experiment with options -- Integrates Azure AI Search for indexing and retrieval of documents, with support for [many document formats](/docs/data_ingestion.md#supported-document-formats) as well as [integrated vectorization](/docs/data_ingestion.md#overview-of-integrated-vectorization) +- Integrates Azure AI Search for indexing and retrieval of documents, with support for [many document formats](/docs/data_ingestion.md#supported-document-formats) as well as [cloud-based data ingestion](/docs/data_ingestion.md#overview-of-cloud-based-vectorization) - Optional usage of [multimodal models](/docs/multimodal.md) to reason over image-heavy documents - Optional addition of [speech input/output](/docs/deploy_features.md#enabling-speech-inputoutput) for accessibility - Optional automation of [user login and data access](/docs/login_and_acl.md) via Microsoft Entra diff --git a/app/backend/prepdocslib/cloud_vs_local.txt b/app/backend/prepdocslib/cloud_vs_local.txt deleted file mode 100644 index bd1d2a67ec..0000000000 --- a/app/backend/prepdocslib/cloud_vs_local.txt +++ /dev/null @@ -1,53 +0,0 @@ -search query: - -{ - "search": "*", - "count": true, - "filter": "sourcepage eq 'Northwind_Health_Plus_Benefits_Details.pdf#page=1'", - "facets": ["sourcepage"] -} - -incorrect = { - "@search.score": 0.5914636, - "id": "7f5de0fafb88_aHR0cHM6Ly9zdGQ0Z2ZiYWpuM2UzeXUuYmxvYi5jb3JlLndpbmRvd3MubmV0L2NvbnRlbnQvTm9ydGh3aW5kX0hlYWx0aF9QbHVzX0JlbmVmaXRzX0RldGFpbHMucGRm0_chunks_0", - "content": "# Contoso Electronics\n\nNorthwind Health Plus Plan\n\n\n
", - "category": null, - "sourcepage": "Northwind_Health_Plus_Benefits_Details.pdf#page=1", - "sourcefile": "Northwind_Health_Plus_Benefits_Details.pdf", - "storageUrl": "https://std4gfbajn3e3yu.blob.core.windows.net/content/Northwind_Health_Plus_Benefits_Details.pdf", - "parent_id": "aHR0cHM6Ly9zdGQ0Z2ZiYWpuM2UzeXUuYmxvYi5jb3JlLndpbmRvd3MubmV0L2NvbnRlbnQvTm9ydGh3aW5kX0hlYWx0aF9QbHVzX0JlbmVmaXRzX0RldGFpbHMucGRm0", - "images": [ - { - "url": "", - "description": "", - "boundingbox": [ - 373.97, - 620.07, - 575.79, - 701.93 - ] - } - ] - } - -correct = { - "@search.score": 1, - "id": "file-Northwind_Health_Plus_Benefits_Details_pdf-4E6F72746877696E645F4865616C74685F506C75735F42656E65666974735F44657461696C732E706466-page-0", - "content": "# Contoso Electronics\n\nNorthwind Health Plus Plan\n\n\nThe image shows a logo consisting of a graphic and text. The graphic is a stylized blue drone with four propellers depicted by four curving lines at the ends of its arms. To the right of the drone graphic, there is the text \"Contoso Electronics\" written in black. The text is aligned vertically with \"Contoso\" on top and \"Electronics\" below it. The overall design is simple and clean.", - "category": null, - "sourcepage": "Northwind_Health_Plus_Benefits_Details.pdf#page=1", - "sourcefile": "Northwind_Health_Plus_Benefits_Details.pdf", - "storageUrl": "https://std4gfbajn3e3yu.blob.core.windows.net/content/Northwind_Health_Plus_Benefits_Details.pdf", - "images": [ - { - "url": "https://std4gfbajn3e3yu.blob.core.windows.net/images/Northwind_Health_Plus_Benefits_Details.pdf/page0/figure1_1.png", - "description": "The image shows a logo consisting of a graphic and text. The graphic is a stylized blue drone with four propellers depicted by four curving lines at the ends of its arms. To the right of the drone graphic, there is the text \"Contoso Electronics\" written in black. The text is aligned vertically with \"Contoso\" on top and \"Electronics\" below it. 
The overall design is simple and clean.", - "boundingbox": [ - 373.97, - 620.07, - 575.79, - 701.93 - ] - } - ] - } diff --git a/app/functions/__init__.py b/app/functions/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/app/functions/text_processor/function_app.py b/app/functions/text_processor/function_app.py index 25f2c15389..2fc345e5f9 100644 --- a/app/functions/text_processor/function_app.py +++ b/app/functions/text_processor/function_app.py @@ -156,6 +156,7 @@ async def _process_document(data: dict[str, Any]) -> list[dict[str, Any]]: figures_input = consolidated_doc.get("figures", []) # serialized skill payload # Merge enriched fields from figure processor into figures array + # TODO: possibly remove enriched_*, are they actually needed? enriched_descriptions = data.get("enriched_descriptions", []) enriched_urls = data.get("enriched_urls", []) enriched_embeddings = data.get("enriched_embeddings", []) diff --git a/azure.yaml b/azure.yaml index 24389c5af6..1079a4239d 100644 --- a/azure.yaml +++ b/azure.yaml @@ -46,48 +46,30 @@ services: host: function hooks: prepackage: - windows: - shell: pwsh - run: python ../../../scripts/copy_prepdocslib.py - interactive: false - continueOnError: false - posix: - shell: sh - run: python ../../../scripts/copy_prepdocslib.py - interactive: false - continueOnError: false + shell: pwsh + run: python ../../../scripts/copy_prepdocslib.py + interactive: false + continueOnError: false figure-processor: project: ./app/functions/figure_processor language: py host: function hooks: prepackage: - windows: - shell: pwsh - run: python ../../../scripts/copy_prepdocslib.py - interactive: false - continueOnError: false - posix: - shell: sh - run: python ../../../scripts/copy_prepdocslib.py - interactive: false - continueOnError: false + shell: pwsh + run: python ../../../scripts/copy_prepdocslib.py + interactive: false + continueOnError: false text-processor: project: ./app/functions/text_processor language: py host: 
function hooks: prepackage: - windows: - shell: pwsh - run: python ../../../scripts/copy_prepdocslib.py - interactive: false - continueOnError: false - posix: - shell: sh - run: python ../../../scripts/copy_prepdocslib.py - interactive: false - continueOnError: false + shell: pwsh + run: python ../../../scripts/copy_prepdocslib.py + interactive: false + continueOnError: false hooks: preprovision: windows: diff --git a/docs/cloud_ingestion.md b/docs/cloud_ingestion.md index 506aa444a7..6b320bd7cd 100644 --- a/docs/cloud_ingestion.md +++ b/docs/cloud_ingestion.md @@ -1,6 +1,6 @@ -# Cloud-Based Data Ingestion with Azure Functions +# RAG chat: Cloud-Based data ingestion with Azure Functions -This document describes the cloud-based ingestion architecture that uses Azure Functions as custom skills for Azure AI Search indexers. +This document describes the cloud-based ingestion architecture that uses Azure Functions as custom skills for Azure AI Search indexer. ## Overview @@ -14,6 +14,9 @@ This architecture enables serverless, scalable, and event-driven document proces ## Architecture +TODO: Replace with a mermaid diagram like textsplitter has, +OR use my images from slides. + ```ascii ┌─────────────────────────────────────────────────────────────────┐ │ USER: Upload files to blob storage (content container) │ diff --git a/docs/data_ingestion.md b/docs/data_ingestion.md index c9f7b13410..18f9651c8b 100644 --- a/docs/data_ingestion.md +++ b/docs/data_ingestion.md @@ -2,15 +2,18 @@ The [azure-search-openai-demo](/) project can set up a full RAG chat app on Azure AI Search and OpenAI so that you can chat on custom data, like internal enterprise data or domain-specific knowledge sets. For full instructions on setting up the project, consult the [main README](/README.md), and then return here for detailed instructions on the data ingestion component. -The chat app provides two ways to ingest data: manual indexing and integrated vectorization. 
This document explains the differences between the two approaches and provides an overview of the manual indexing process. +The chat app provides two ways to ingest data: manual ingestion and cloud-based ingestion. Both approaches use the same code for processing the data, but the manual ingestion runs locally while cloud ingestion runs in Azure Functions as Azure AI Search custom skills. - [Supported document formats](#supported-document-formats) -- [Manual indexing process](#manual-indexing-process) - - [Chunking](#chunking) +- [Ingestion stages](#ingestion-stages) + - [Document extraction](#document-extraction) + - [Figure processing](#figure-processing) + - [Text processing](#text-processing) +- [Local ingestion](#local-ingestion) - [Categorizing data for enhanced search](#enhancing-search-functionality-with-data-categorization) - [Indexing additional documents](#indexing-additional-documents) - [Removing documents](#removing-documents) -- [Integrated Vectorization](#integrated-vectorization) +- [Cloud-based ingestion](#cloud-based-ingestion) - [Indexing of additional documents](#indexing-of-additional-documents) - [Removal of documents](#removal-of-documents) - [Scheduled indexing](#scheduled-indexing) @@ -30,9 +33,72 @@ In order to ingest a document format, we need a tool that can turn it into text. | JSON | Yes (Local) | Yes | | CSV | Yes (Local) | Yes | -The Blob indexer used by the Integrated Vectorization approach also supports a few [additional formats](https://learn.microsoft.com/azure/search/search-howto-indexing-azure-blob-storage#supported-document-formats). +## Ingestion stages -## Manual indexing process +The ingestion pipeline consists of three main stages that transform raw documents into searchable content in Azure AI Search. These stages apply to both local ingestion (using `prepdocs.py`) and cloud-based ingestion (using Azure Functions as custom skills). 
+ +### Document extraction + +The first stage extracts text and structured content from source documents using parsers tailored to each file format. For PDF, HTML, DOCX, PPTX, XLSX, and image files, the pipeline defaults to using [Azure Document Intelligence](https://learn.microsoft.com/azure/ai-services/document-intelligence/overview) to extract text, tables, and figures with layout information. Alternatively, local parsers like PyPDF and BeautifulSoup can be used to reduce costs for simpler documents. For TXT, JSON, and CSV files, lightweight local parsers extract the content directly. + +During extraction, tables are converted to HTML markup to preserve their structure, and figures (when multimodal is enabled) are identified with bounding boxes and placeholders. + +The output from this stage is a list of pages, each containing the extracted text with embedded table HTML and figure placeholders like `
`.
+
+### Figure processing
+
+This stage is optional and only applies when the multimodal feature is enabled *and* the document itself has figures. See [multimodal feature documentation](./multimodal.md) for more details.
+
+When multimodal support is enabled, figures extracted in the previous stage are enriched with descriptions and embeddings. Each figure is:
+
+1. **Cropped and saved**: The figure image is cropped from the PDF using its bounding box coordinates and saved as a PNG file.
+2. **Described**: A text description is generated using either Azure OpenAI's GPT-4 Vision model or Azure AI Content Understanding, depending on configuration.
+3. **Uploaded**: The figure image is uploaded to Azure Blob Storage and assigned a URL.
+4. **Embedded** (optional): If image embeddings are enabled, a vector embedding is computed for the figure using Azure AI Vision.
+
+The output from this stage is enriched figure metadata, including the description text, storage URL, and optional embedding vector.
+
+### Text processing
+
+The final stage combines the extracted text with figure descriptions, splits the content into searchable chunks, and computes embeddings.
+
+#### Figure merging
+
+First, figure placeholders in the page text are replaced with full HTML markup that includes the figure caption and generated description, creating a cohesive text narrative that incorporates visual content.
+
+#### Chunking
+
+Next, the combined text is split into chunks using a sentence-aware splitter that respects semantic boundaries. The default chunk size is approximately 1000 characters (roughly 400-500 tokens for English), with a 10% overlap between consecutive chunks to preserve context across boundaries. The splitter uses a sliding window approach, ensuring that sentences ending one chunk also start the next, which reduces the risk of losing important context at chunk boundaries.
+ +**Why chunk documents?** While Azure AI Search can index full documents, chunking is essential for the RAG pattern because it limits the amount of information sent to OpenAI, which has token limits for context windows. By breaking content into focused chunks, the system can retrieve and inject only the most relevant pieces of text into the LLM prompt, improving both response quality and cost efficiency. + +If needed, you can modify the chunking algorithm in `app/backend/prepdocslib/textsplitter.py`. For a deeper, diagram-rich explanation of how the splitter works (figures, recursion, merge heuristics, guarantees, and examples), see the [text splitter documentation](./textsplitter.md). + +#### Embedding + +Finally, if vector search is enabled, text embeddings are computed for each chunk using Azure OpenAI's embedding models (text-embedding-ada-002, text-embedding-3-small, or text-embedding-3-large). These embeddings are generated in batches for efficiency, with retry logic to handle rate limits. + +### Indexing + +The final step is to index the chunks into Azure AI Search. Each chunk is stored as a separate document in the search index, with metadata linking it back to the source file and page number. If vector search is enabled, the computed embeddings are also stored alongside the text, enabling efficient similarity searches during query time. + +Here's an example of what a final indexed chunk document looks like: + +```json +{ + "id": "file-Northwind_Health_Plus_Benefits_Details_pdf-4E6F72746877696E645F4865616C74685F506C75735F42656E65666974735F44657461696C732E706466-page-0", + "content": "# Contoso Electronics\n\nNorthwind Health Plus Plan\n...", + "category": null, + "sourcepage": "Northwind_Health_Plus_Benefits_Details.pdf#page=1", + "sourcefile": "Northwind_Health_Plus_Benefits_Details.pdf", + "storageUrl": "https://std4gfbajn3e3yu.blob.core.windows.net/content/Northwind_Health_Plus_Benefits_Details.pdf", + "embedding": [0.0123, -0.0456, ...] 
+} +``` + +If multimodal is enabled, that document will also include an `"images"` field and figure descriptions in the `"content"` field. + +## Local ingestion The [`prepdocs.py`](../app/backend/prepdocs.py) script is responsible for both uploading and indexing documents. The typical usage is to call it using `scripts/prepdocs.sh` (Mac/Linux) or `scripts/prepdocs.ps1` (Windows), as these scripts will set up a Python virtual environment and pass in the required parameters based on the current `azd` environment. You can pass additional arguments directly to the script, for example `scripts/prepdocs.ps1 --removeall`. Whenever `azd up` or `azd provision` is run, the script is called automatically. @@ -45,14 +111,6 @@ The script uses the following steps to index documents: 3. Split the PDFs into chunks of text. 4. Upload the chunks to Azure AI Search. If using vectors (the default), also compute the embeddings and upload those alongside the text. -### Chunking - -We're often asked why we need to break up the PDFs into chunks when Azure AI Search supports searching large documents. - -Chunking allows us to limit the amount of information we send to OpenAI due to token limits. By breaking up the content, it allows us to easily find potential chunks of text that we can inject into OpenAI. The method of chunking we use leverages a sliding window of text such that sentences that end one chunk will start the next. This allows us to reduce the chance of losing the context of the text. - -If needed, you can modify the chunking algorithm in `app/backend/prepdocslib/textsplitter.py`. For a deeper, diagram-rich explanation of how the splitter works (figures, recursion, merge heuristics, guarantees, and examples), see the [text splitter documentation](./textsplitter.md). 
- ### Enhancing search functionality with data categorization To enhance search functionality, categorize data during the ingestion process with the `--category` argument, for example `scripts/prepdocs.ps1 --category ExampleCategoryName`. This argument specifies the category to which the data belongs, enabling you to filter search results based on these categories. @@ -63,7 +121,7 @@ After running the script with the desired category, ensure these categories are To upload more PDFs, put them in the data/ folder and run `./scripts/prepdocs.sh` or `./scripts/prepdocs.ps1`. -A [recent change](https://github.com/Azure-Samples/azure-search-openai-demo/pull/835) added checks to see what's been uploaded before. The prepdocs script now writes an .md5 file with an MD5 hash of each file that gets uploaded. Whenever the prepdocs script is re-run, that hash is checked against the current hash and the file is skipped if it hasn't changed. +The prepdocs script writes an .md5 file with an MD5 hash of each file that gets uploaded. Whenever the prepdocs script is re-run, that hash is checked against the current hash and the file is skipped if it hasn't changed. ### Removing documents @@ -73,19 +131,13 @@ To remove all documents, use `./scripts/prepdocs.sh --removeall` or `./scripts/p You can also remove individual documents by using the `--remove` flag. Open either `scripts/prepdocs.sh` or `scripts/prepdocs.ps1` and replace `/data/*` with `/data/YOUR-DOCUMENT-FILENAME-GOES-HERE.pdf`. Then run `scripts/prepdocs.sh --remove` or `scripts/prepdocs.ps1 --remove`. -## Integrated Vectorization - -Azure AI Search includes an [integrated vectorization feature](https://techcommunity.microsoft.com/blog/azure-ai-services-blog/announcing-the-public-preview-of-integrated-vectorization-in-azure-ai-search/3960809), a cloud-based approach to data ingestion. 
Integrated vectorization takes care of document format cracking, data extraction, chunking, vectorization, and indexing, all with Azure technologies.
-
-See [this notebook](https://github.com/Azure/azure-search-vector-samples/blob/main/demo-python/code/integrated-vectorization/azure-search-integrated-vectorization-sample.ipynb) to understand the process of setting up integrated vectorization.
-We have integrated that code into our `prepdocs` script, so you can use it without needing to understand the details.
+## Cloud-based ingestion
 
-You must first explicitly [enable integrated vectorization](./deploy_features.md#enabling-integrated-vectorization) in the `azd` environment to use this feature.
+This project includes an optional feature to perform data ingestion in the cloud using Azure Functions as custom skills for Azure AI Search indexers. This approach offloads the ingestion workload from your local machine to the cloud, allowing for more scalable and efficient processing of large datasets.
 
-This feature cannot be used on existing index. You need to create a new index or drop and recreate an existing index.
-In the newly created index schema, a new field 'parent_id' is added. This is used internally by the indexer to manage life cycle of chunks.
+You must first explicitly [enable cloud ingestion](./deploy_features.md#enabling-cloud-ingestion) in the `azd` environment to use this feature.
 
-This feature is not supported in the free SKU for Azure AI Search.
+This feature cannot be used on an existing index. You need to create a new index or drop and recreate an existing index. In the newly created index schema, a new field 'parent_id' is added. This is used internally by the indexer to manage the life cycle of chunks.
### Indexing of additional documents diff --git a/docs/deploy_features.md b/docs/deploy_features.md index af8bce1463..467a5991d6 100644 --- a/docs/deploy_features.md +++ b/docs/deploy_features.md @@ -324,30 +324,35 @@ Alternatively you can use the browser's built-in [Speech Synthesis API](https:// azd env set USE_SPEECH_OUTPUT_BROWSER true ``` -## Enabling Integrated Vectorization +## Enabling cloud-based data ingestion -Azure AI search recently introduced an [integrated vectorization feature in preview mode](https://techcommunity.microsoft.com/blog/azure-ai-services-blog/announcing-the-public-preview-of-integrated-vectorization-in-azure-ai-search/3960809). This feature is a cloud-based approach to data ingestion, which takes care of document format cracking, data extraction, chunking, vectorization, and indexing, all with Azure technologies. +By default, this project runs a local script in order to ingest data. Once you move beyond the sample documents, you may want cloud-based ingestion, which uses Azure AI Search indexers and custom Azure AI Search skills based off the same code used by the local ingestion. That approach scales better to larger amounts of data. -To enable integrated vectorization with this sample: +To enable cloud ingestion: -1. If you've previously deployed, delete the existing search index. 🗑️ -2. To enable the use of integrated vectorization, run: +1. If you've previously deployed, delete the existing search index or create a new index using: ```shell - azd env set USE_FEATURE_INT_VECTORIZATION true + azd env set AZURE_SEARCH_INDEX cloudindex ``` -3. If you've already deployed your app, then you can run just the `provision` step: +2. Run this command: ```shell - azd provision + azd env set USE_CLOUD_INGESTION true ``` - That will set up necessary RBAC roles and configure the integrated vectorization feature on your search service. +3. Open `azure.yaml` and un-comment the document-extractor, figure-processor, and text-processor sections. 
Those are the Azure Functions apps that will be deployed and serve as Azure AI Search skills. - If you haven't deployed your app yet, then you should run the full `azd up` after configuring all optional features. +4. Provision the new Azure Functions resources, deploy the function apps, and update the search indexer with: -4. You can view the resources such as the indexer and skillset in Azure Portal and monitor the status of the vectorization process. + ```shell + azd up + ``` + +5. That will upload the documents in the `data/` folder to the Blob storage container, create the indexer and skillset, and run the indexer to ingest the data. You can monitor the indexer status from the portal. + +6. When you have new documents to ingest, you can upload documents to the Blob storage container and run the indexer from the Azure Portal to ingest new documents. ## Enabling authentication diff --git a/pyproject.toml b/pyproject.toml index 195e98998d..0a59a7da1c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -2,7 +2,7 @@ target-version = "py39" lint.select = ["E", "F", "I", "UP"] lint.ignore = ["E501", "E701"] # line too long, multiple statements on one line -src = ["app/backend", "scripts"] +src = ["app/backend", "scripts", "app/functions"] [tool.ruff.lint.isort] known-local-folder = ["scripts"] @@ -12,7 +12,7 @@ line-length = 120 [tool.pytest.ini_options] addopts = "-ra" -pythonpath = ["app/backend", "scripts"] +pythonpath = ["app/backend", "scripts", "app/functions"] asyncio_default_fixture_loop_scope = "function" [tool.coverage.paths] diff --git a/scripts/compare_search_indexes.py b/scripts/compare_search_indexes.py new file mode 100644 index 0000000000..ccaa6c8ec4 --- /dev/null +++ b/scripts/compare_search_indexes.py @@ -0,0 +1,298 @@ +"""Compare documents across two Azure AI Search indexes using azd credentials.""" + +from __future__ import annotations + +import argparse +import asyncio +import logging +import os +from collections.abc import Iterable, Mapping +from 
dataclasses import dataclass, field +from typing import Any, cast + +from azure.core.credentials_async import AsyncTokenCredential +from azure.identity.aio import AzureDeveloperCliCredential +from azure.search.documents.aio import SearchClient + +from load_azd_env import load_azd_env + +logger = logging.getLogger("scripts") + +IndexKey = tuple[str | None, str | None] + + +@dataclass +class IndexComparisonResult: + """Holds summary data for one index.""" + + index_name: str + total_documents: int + keys: set[IndexKey] + documents_by_key: dict[IndexKey, list[dict[str, Any]]] = field(default_factory=dict) + + +async def collect_index_documents( + *, endpoint: str, credential: AsyncTokenCredential, index_name: str +) -> IndexComparisonResult: + """Collect all documents grouped by (sourcefile, sourcepage) pairs for the specified index.""" + + keys: set[IndexKey] = set() + documents_by_key: dict[IndexKey, list[dict[str, Any]]] = {} + total_documents = 0 + + async with SearchClient(endpoint=endpoint, index_name=index_name, credential=credential) as client: + results = await client.search( + search_text="", + select="*", + include_total_count=True, + ) + async for doc in results: + document = cast(Mapping[str, Any], doc) + total_documents += 1 + sourcefile = document.get("sourcefile") + sourcepage = document.get("sourcepage") + key = (sourcefile, sourcepage) + keys.add(key) + if key not in documents_by_key: + documents_by_key[key] = [] + documents_by_key[key].append(dict(document)) + + return IndexComparisonResult( + index_name=index_name, total_documents=total_documents, keys=keys, documents_by_key=documents_by_key + ) + + +def parse_args() -> argparse.Namespace: + """Parse command-line arguments.""" + + parser = argparse.ArgumentParser( + description="Compare documents across two Azure AI Search indexes using sourcefile/sourcepage pairs.", + ) + parser.add_argument("first_index", help="Name of the first search index to compare.") + parser.add_argument("second_index", 
help="Name of the second search index to compare.") + return parser.parse_args() + + +def build_endpoint(service_name: str) -> str: + """Return the full endpoint URL for the Azure AI Search service.""" + + return f"https://{service_name}.search.windows.net" + + +async def compare_indexes( + *, first_index: str, second_index: str, endpoint: str, credential: AsyncTokenCredential +) -> None: + """Fetch documents from both indexes and report detailed field differences.""" + + first_result, second_result = await asyncio.gather( + collect_index_documents(endpoint=endpoint, credential=credential, index_name=first_index), + collect_index_documents(endpoint=endpoint, credential=credential, index_name=second_index), + ) + + missing_from_second = first_result.keys - second_result.keys + missing_from_first = second_result.keys - first_result.keys + + logger.info( + "Index '%s': %d docs, %d unique source pairs", + first_result.index_name, + first_result.total_documents, + len(first_result.keys), + ) + logger.info( + "Index '%s': %d docs, %d unique source pairs", + second_result.index_name, + second_result.total_documents, + len(second_result.keys), + ) + + def format_missing(pairs: Iterable[IndexKey]) -> str: + return "\n".join( + f" sourcefile={sourcefile or ''}, sourcepage={sourcepage or ''}" + for sourcefile, sourcepage in sorted(pairs) + ) + + if missing_from_second: + logger.warning( + "Pairs present in '%s' but missing in '%s':\n%s", + first_index, + second_index, + format_missing(missing_from_second), + ) + if missing_from_first: + logger.warning( + "Pairs present in '%s' but missing in '%s':\n%s", + second_index, + first_index, + format_missing(missing_from_first), + ) + + # Compare common keys for field differences + common_keys = first_result.keys & second_result.keys + if common_keys: + logger.info("Comparing %d common source pairs for field differences...", len(common_keys)) + differences_found = False + + for key in sorted(common_keys): + first_docs = 
first_result.documents_by_key[key] + second_docs = second_result.documents_by_key[key] + + if len(first_docs) != len(second_docs): + differences_found = True + logger.warning("\n=== MISMATCH for sourcefile=%s, sourcepage=%s ===", key[0], key[1]) + logger.warning( + " Document count: %s has %d chunks, %s has %d chunks", + first_index, + len(first_docs), + second_index, + len(second_docs), + ) + + # Compare field sets and values for each document pair + for idx, (doc1, doc2) in enumerate(zip(first_docs, second_docs)): + fields1 = set(doc1.keys()) + fields2 = set(doc2.keys()) + + missing_fields_in_second = fields1 - fields2 + missing_fields_in_first = fields2 - fields1 + + has_field_diff = missing_fields_in_second or missing_fields_in_first + has_value_diff = False + value_diffs: list[tuple[str, Any, Any]] = [] + embedding_diffs: list[tuple[str, int | None, int | None]] = [] + + # Get common fields first + common_fields = fields1 & fields2 + + # Compare embedding fields separately (dimension only, not values) + for field_name in sorted(common_fields): + if "embedding" in field_name.lower(): + val1 = doc1[field_name] + val2 = doc2[field_name] + dim1 = len(val1) if isinstance(val1, list) else None + dim2 = len(val2) if isinstance(val2, list) else None + if dim1 != dim2: + embedding_diffs.append((field_name, dim1, dim2)) + + # Compare values for common fields (excluding embeddings and large fields) + for field_name in sorted(common_fields): + # Skip embedding fields and other large binary/array fields + if "embedding" in field_name.lower() or field_name.startswith("@search"): + continue + + val1 = doc1[field_name] + val2 = doc2[field_name] + + # Special handling for images field + if field_name == "images": + if isinstance(val1, list) and isinstance(val2, list): + if len(val1) != len(val2): + has_value_diff = True + value_diffs.append((field_name, val1, val2)) + elif len(val1) > 0: + # Compare first image's non-embedding fields + img1_keys = set(val1[0].keys()) - 
{"embedding"} + img2_keys = set(val2[0].keys()) - {"embedding"} + if img1_keys != img2_keys: + has_value_diff = True + value_diffs.append((field_name, val1, val2)) + # Check image embedding dimensions + for img_idx, (img1, img2) in enumerate(zip(val1, val2)): + if "embedding" in img1 and "embedding" in img2: + emb1 = img1["embedding"] + emb2 = img2["embedding"] + dim1 = len(emb1) if isinstance(emb1, list) else None + dim2 = len(emb2) if isinstance(emb2, list) else None + if dim1 != dim2: + embedding_diffs.append((f"images[{img_idx}].embedding", dim1, dim2)) + elif val1 != val2: + has_value_diff = True + value_diffs.append((field_name, val1, val2)) + # Special handling for content field - normalize whitespace + elif field_name == "content": + normalized1 = " ".join(str(val1).split()) if val1 else "" + normalized2 = " ".join(str(val2).split()) if val2 else "" + if normalized1 != normalized2: + has_value_diff = True + value_diffs.append((field_name, val1, val2)) + elif val1 != val2: + has_value_diff = True + value_diffs.append((field_name, val1, val2)) + + if has_field_diff or has_value_diff or embedding_diffs: + differences_found = True + logger.warning( + "\n=== DIFFERENCE for sourcefile=%s, sourcepage=%s (chunk %d) ===", key[0], key[1], idx + ) + + if missing_fields_in_second: + logger.warning(" Fields only in %s: %s", first_index, sorted(missing_fields_in_second)) + if missing_fields_in_first: + logger.warning(" Fields only in %s: %s", second_index, sorted(missing_fields_in_first)) + + if embedding_diffs: + for field_name, dim1, dim2 in embedding_diffs: + logger.warning(" Embedding field '%s' dimension mismatch:", field_name) + logger.warning(" %s: %s dimensions", first_index, dim1) + logger.warning(" %s: %s dimensions", second_index, dim2) + + for field_name, val1, val2 in value_diffs: + logger.warning(" Field '%s':", field_name) + logger.warning(" %s: %s", first_index, _format_value(val1, field_name)) + logger.warning(" %s: %s", second_index, _format_value(val2, 
field_name)) + + if not differences_found: + logger.info("No field differences found for common source pairs.") + + if not missing_from_first and not missing_from_second and not differences_found: + logger.info("Indexes are identical.") + + +def _format_value(val: Any, field_name: str | None = None) -> str: + """Format a field value for logging, truncating if necessary.""" + if val is None: + return "" + if isinstance(val, str): + return val[:200] + "..." if len(val) > 200 else val + if isinstance(val, list): + # Special formatting for images field + if field_name == "images" and len(val) > 0 and isinstance(val[0], dict): + img_keys = sorted(set(val[0].keys()) - {"embedding"}) + return f"[{len(val)} images with fields: {img_keys}]" + return f"[{len(val)} items]" if len(val) > 5 else str(val) + return str(val) + + +async def main() -> None: + """Entry point for asynchronous execution.""" + + args = parse_args() + + load_azd_env() + + service_name = os.getenv("AZURE_SEARCH_SERVICE") + if not service_name: + raise RuntimeError( + "AZURE_SEARCH_SERVICE must be set. Run 'azd env get-values' or ensure azd environment is loaded." 
+ ) + + endpoint = build_endpoint(service_name) + + tenant_id = os.getenv("AZURE_TENANT_ID") + credential = AzureDeveloperCliCredential(tenant_id=tenant_id) if tenant_id else AzureDeveloperCliCredential() + + try: + await compare_indexes( + first_index=args.first_index, + second_index=args.second_index, + endpoint=endpoint, + credential=credential, + ) + finally: + await credential.close() + + +if __name__ == "__main__": + logging.basicConfig(level=logging.INFO, format="%(message)s") + logger.setLevel(logging.DEBUG) + + asyncio.run(main()) diff --git a/tests/test_function_apps.py b/tests/test_function_apps.py new file mode 100644 index 0000000000..33fc7bf0ca --- /dev/null +++ b/tests/test_function_apps.py @@ -0,0 +1,396 @@ +import base64 +import importlib +import json +from collections.abc import Iterable +from contextlib import contextmanager +from dataclasses import dataclass, field +from typing import Any + +import azure.functions as func +import pytest + +from document_extractor import function_app as document_extractor +from figure_processor import function_app as figure_processor +from tests.mocks import TEST_PNG_BYTES +from text_processor import function_app as text_processor + + +@dataclass +class ChunkStub: + page_num: int + text: str + images: list[Any] = field(default_factory=list) + + +@dataclass +class SectionStub: + chunk: ChunkStub + + +@contextmanager +def restore_module_state(module, attributes: list[str]): + saved = {name: getattr(module, name) for name in attributes} + try: + yield + finally: + for name, value in saved.items(): + setattr(module, name, value) + + +def build_request(payload: dict[str, Any]) -> func.HttpRequest: + """Construct an HttpRequest carrying the provided payload.""" + body = json.dumps(payload).encode("utf-8") + return func.HttpRequest( + method="POST", + url="http://localhost/api", + headers={}, + params={}, + body=body, + ) + + +def build_raw_request(body: bytes) -> func.HttpRequest: + """Construct an HttpRequest with a 
raw (non-JSON) payload.""" + return func.HttpRequest( + method="POST", + url="http://localhost/api", + headers={}, + params={}, + body=body, + ) + + +@pytest.mark.asyncio +async def test_document_extractor_emits_pages_and_figures(monkeypatch: pytest.MonkeyPatch) -> None: + """Document extractor returns pages with associated figures.""" + + class StubParser: + def __init__(self, pages: Iterable[Any]) -> None: + self._pages = list(pages) + + async def parse(self, content: Any): + for page in self._pages: + yield page + + placeholder = '
' + figure = figure_processor.ImageOnPage( + bytes=TEST_PNG_BYTES, + bbox=(10.0, 20.0, 30.0, 40.0), + filename="figure1.png", + figure_id="fig-1", + page_num=0, + placeholder=placeholder, + title="Drone Logo", + ) + page_text = f"# Heading\n\n{placeholder}\n\nConclusion." + page = document_extractor.Page(page_num=0, offset=0, text=page_text, images=[figure]) + + monkeypatch.setattr(document_extractor, "select_parser", lambda **_: StubParser([page])) + + request_payload = { + "values": [ + { + "recordId": "record-1", + "data": { + "file_data": {"$type": "file", "data": base64.b64encode(b"pdf-bytes").decode("utf-8")}, + "file_name": "sample.pdf", + "contentType": "application/pdf", + }, + } + ] + } + response = await document_extractor.extract_document(build_request(request_payload)) + + assert response.status_code == 200 + body = json.loads(response.get_body().decode("utf-8")) + values = body["values"] + assert len(values) == 1 + result = values[0] + assert result["recordId"] == "record-1" + + data = result["data"] + assert data["file_name"] == "sample.pdf" + assert data["pages"] == [ + {"page_num": 0, "text": page_text, "figure_ids": ["fig-1"]}, + ] + assert len(data["figures"]) == 1 + figure_entry = data["figures"][0] + assert figure_entry["figure_id"] == "fig-1" + assert figure_entry["document_file_name"] == "sample.pdf" + assert figure_entry["bbox"] == [10.0, 20.0, 30.0, 40.0] + assert figure_entry["bytes_base64"] == base64.b64encode(TEST_PNG_BYTES).decode("utf-8") + + +@pytest.mark.asyncio +async def test_document_extractor_requires_single_record() -> None: + response = await document_extractor.extract_document(build_request({"values": []})) + assert response.status_code == 500 + body = json.loads(response.get_body().decode("utf-8")) + assert body["error"] + + +@pytest.mark.asyncio +async def test_document_extractor_handles_processing_exception(monkeypatch: pytest.MonkeyPatch) -> None: + async def failing_process(data: dict[str, Any]) -> dict[str, Any]: + raise 
RuntimeError("boom") + + monkeypatch.setattr(document_extractor, "process_document", failing_process) + + payload = { + "values": [ + { + "recordId": "rec-error", + "data": { + "file_data": {"$type": "file", "data": base64.b64encode(b"pdf-bytes").decode("utf-8")}, + "file_name": "sample.pdf", + "contentType": "application/pdf", + }, + } + ] + } + + response = await document_extractor.extract_document(build_request(payload)) + assert response.status_code == 200 + values = json.loads(response.get_body().decode("utf-8"))["values"] + assert values[0]["errors"][0]["message"] == "boom" + + +@pytest.mark.asyncio +async def test_document_extractor_invalid_json_returns_error() -> None: + response = await document_extractor.extract_document(build_raw_request(b"not json")) + assert response.status_code == 500 + body = json.loads(response.get_body().decode("utf-8")) + assert "error" in body + + +@pytest.mark.asyncio +async def test_document_extractor_process_document_http_error(monkeypatch: pytest.MonkeyPatch) -> None: + class FailingParser: + async def parse(self, content): + raise document_extractor.HttpResponseError(message="fail") + yield # Make this an async generator + + monkeypatch.setattr(document_extractor, "select_parser", lambda **_: FailingParser()) + + data = { + "file_data": {"data": base64.b64encode(b"content").decode("utf-8")}, + "file_name": "doc.pdf", + "contentType": "application/pdf", + } + + with pytest.raises(ValueError) as exc_info: + await document_extractor.process_document(data) + + assert "Parser failed" in str(exc_info.value) + + +def test_document_extractor_missing_file_data() -> None: + with pytest.raises(ValueError): + document_extractor.get_document_stream_filedata({"file_data": {}}) + + +def test_document_extractor_managed_identity_reload(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("AZURE_CLIENT_ID", "client-123") + module = importlib.reload(document_extractor) + assert isinstance(module.AZURE_CREDENTIAL, 
module.ManagedIdentityCredential) + monkeypatch.delenv("AZURE_CLIENT_ID", raising=False) + importlib.reload(document_extractor) + + +@pytest.mark.asyncio +async def test_figure_processor_returns_enriched_metadata(monkeypatch: pytest.MonkeyPatch) -> None: + """Figure processor enriches images with URL and description.""" + + async def fake_process_page_image(*, image, document_filename: str, **kwargs: Any): + image.url = f"https://images.example.com/{document_filename}/{image.figure_id}.png" + image.description = f"Description for {image.figure_id}" + image.embedding = [0.11, 0.22, 0.33] + return image + + monkeypatch.setattr(figure_processor, "process_page_image", fake_process_page_image) + monkeypatch.setattr(figure_processor, "BLOB_MANAGER", object()) + monkeypatch.setattr(figure_processor, "FIGURE_PROCESSOR", object()) + monkeypatch.setattr(figure_processor, "IMAGE_EMBEDDINGS", object()) + + figure = figure_processor.ImageOnPage( + bytes=TEST_PNG_BYTES, + bbox=(1.0, 2.0, 3.0, 4.0), + filename="figure1.png", + figure_id="fig-1", + page_num=0, + placeholder='
', + ) + figure_payload = figure.to_skill_payload("sample.pdf") + + request_payload = { + "values": [ + { + "recordId": "rec-1", + "data": figure_payload, + } + ] + } + + response = await figure_processor.process_figure_request(build_request(request_payload)) + + assert response.status_code == 200 + body = json.loads(response.get_body().decode("utf-8")) + values = body["values"] + assert len(values) == 1 + result = values[0] + assert result["recordId"] == "rec-1" + + data = result["data"] + assert data["figure_id"] == "fig-1" + assert data["url"] == "https://images.example.com/sample.pdf/fig-1.png" + assert data["description"] == "Description for fig-1" + assert data["embedding"] == [0.11, 0.22, 0.33] + assert "bytes_base64" not in data + + +@pytest.mark.asyncio +async def test_figure_processor_invalid_json_returns_error() -> None: + response = await figure_processor.process_figure_request(build_raw_request(b"not json")) + assert response.status_code == 400 + payload = json.loads(response.get_body().decode("utf-8")) + assert payload["error"] == "Invalid JSON payload" + + +def test_figure_processor_initialisation_with_env(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("AZURE_CLIENT_ID", "client-456") + monkeypatch.setenv("AZURE_STORAGE_ACCOUNT", "acct") + monkeypatch.setenv("AZURE_IMAGESTORAGE_CONTAINER", "images") + monkeypatch.setenv("USE_MULTIMODAL", "true") + monkeypatch.setenv("AZURE_OPENAI_SERVICE", "svc") + monkeypatch.setenv("AZURE_OPENAI_CHATGPT_DEPLOYMENT", "deploy") + monkeypatch.setenv("AZURE_VISION_ENDPOINT", "https://vision") + + import sys + from pathlib import Path + + fp_root = Path(__file__).parent.parent / "app" / "functions" / "figure_processor" + sys.path.insert(0, str(fp_root)) + + fp_servicesetup = importlib.import_module("prepdocslib.servicesetup") + fp_embeddings = importlib.import_module("prepdocslib.embeddings") + + monkeypatch.setattr(fp_servicesetup, "setup_blob_manager", lambda **_: "blob") + 
monkeypatch.setattr(fp_servicesetup, "setup_figure_processor", lambda **_: "figproc") + monkeypatch.setattr(fp_servicesetup, "setup_openai_client", lambda **_: "openai-client") + + class DummyImageEmbeddings: + def __init__(self, endpoint: str, token_provider): + self.endpoint = endpoint + self.token_provider = token_provider + + monkeypatch.setattr(fp_embeddings, "ImageEmbeddings", DummyImageEmbeddings) + monkeypatch.setattr("azure.identity.aio.get_bearer_token_provider", lambda *_, **__: lambda: "token") + + module = importlib.reload(figure_processor) + assert module.BLOB_MANAGER == "blob" + assert module.FIGURE_PROCESSOR == "figproc" + assert isinstance(module.IMAGE_EMBEDDINGS, DummyImageEmbeddings) + + # Reset module to default configuration for subsequent tests + for var in [ + "AZURE_CLIENT_ID", + "AZURE_STORAGE_ACCOUNT", + "AZURE_IMAGESTORAGE_CONTAINER", + "USE_MULTIMODAL", + "AZURE_OPENAI_SERVICE", + "AZURE_OPENAI_CHATGPT_DEPLOYMENT", + "AZURE_VISION_ENDPOINT", + ]: + monkeypatch.delenv(var, raising=False) + sys.path.remove(str(fp_root)) + importlib.reload(figure_processor) + + +def test_figure_processor_warns_when_openai_incomplete(monkeypatch: pytest.MonkeyPatch) -> None: + """Figure processor is None when USE_MULTIMODAL is true but OpenAI config is incomplete.""" + monkeypatch.setenv("USE_MULTIMODAL", "true") + # OpenAI config missing, so FIGURE_PROCESSOR should be None + module = importlib.reload(figure_processor) + # Without OpenAI or Content Understanding config, processor is None + assert module.FIGURE_PROCESSOR is None + monkeypatch.delenv("USE_MULTIMODAL", raising=False) + importlib.reload(figure_processor) + + +@pytest.mark.asyncio +async def test_text_processor_builds_chunk_with_caption(monkeypatch: pytest.MonkeyPatch) -> None: + """Text processor merges figure metadata and emits chunk with embeddings.""" + + class StubSplitter: + def split_pages(self, pages: list[Any]): + for page in pages: + yield ChunkStub(page_num=page.page_num, 
text=page.text) + + class StubEmbeddingService: + async def create_embeddings(self, texts: list[str]) -> list[list[float]]: + return [[0.41, 0.42, 0.43] for _ in texts] + + monkeypatch.setattr(text_processor, "SENTENCE_SPLITTER", StubSplitter()) + monkeypatch.setattr(text_processor, "EMBEDDING_SERVICE", StubEmbeddingService()) + monkeypatch.setattr(text_processor, "AZURE_OPENAI_EMB_DIMENSIONS", 3) + monkeypatch.setattr(text_processor, "USE_MULTIMODAL", False) + + figure = figure_processor.ImageOnPage( + bytes=TEST_PNG_BYTES, + bbox=(5.0, 6.0, 7.0, 8.0), + filename="figure1.png", + figure_id="fig-1", + page_num=0, + placeholder='
', + title="Drone Logo", + ) + figure_payload = figure.to_skill_payload("financial.pdf") + + page_text = 'Summary paragraph.\n\n
\n\nClosing remarks.' + request_payload = { + "values": [ + { + "recordId": "doc-1", + "data": { + "consolidated_document": { + "file_name": "financial.pdf", + "storageUrl": "https://storage.example.com/content/financial.pdf", + "pages": [ + {"page_num": 0, "text": page_text, "figure_ids": ["fig-1"]}, + ], + "figures": [figure_payload], + }, + "enriched_descriptions": ["A drone-themed company logo."], + "enriched_urls": ["https://images.example.com/fig-1.png"], + "enriched_embeddings": [[0.51, 0.52, 0.53]], + }, + } + ] + } + + response = await text_processor.process_text_entry(build_request(request_payload)) + + assert response.status_code == 200 + body = json.loads(response.get_body().decode("utf-8")) + values = body["values"] + assert len(values) == 1 + result = values[0] + assert result["recordId"] == "doc-1" + + data = result["data"] + chunks = data["chunks"] + assert len(chunks) == 1 + chunk = chunks[0] + assert chunk["parent_id"] == "https://storage.example.com/content/financial.pdf" + assert chunk["sourcepage"] == "financial.pdf#page=1" + assert chunk["embedding"] == [0.41, 0.42, 0.43] + assert chunk["images"] == [ + { + "url": "https://images.example.com/fig-1.png", + "description": "A drone-themed company logo.", + "boundingbox": [5.0, 6.0, 7.0, 8.0], + } + ] + assert '
' not in chunk["content"] + assert "A drone-themed company logo." in chunk["content"] + assert chunk["id"].endswith("-0000") From 267ff51e53cdfe4943d5378d97ee7537feb55ca6 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Mon, 10 Nov 2025 17:05:21 -0800 Subject: [PATCH 16/30] Update --- app/backend/app.py | 24 ++- app/backend/prepdocs.py | 104 ++-------- .../prepdocslib/cloudingestionstrategy.py | 2 +- app/backend/prepdocslib/figureprocessor.py | 20 +- app/backend/prepdocslib/mediadescriber.py | 1 + app/backend/prepdocslib/servicesetup.py | 35 ++++ app/backend/setup_cloud_ingestion.py | 182 ++++++++++++++++++ .../document_extractor/requirements.txt | 25 +-- .../figure_processor/function_app.py | 1 - .../figure_processor/requirements.txt | 25 +-- app/functions/text_processor/requirements.txt | 25 +-- azure.yaml | 11 ++ docs/cloud_ingestion.md | 1 - infra/app/functions.bicep | 1 - infra/main.bicep | 1 - requirements-dev.txt | 1 + scripts/compare_search_indexes.py | 84 +++++++- scripts/prepdocs.ps1 | 6 + scripts/prepdocs.sh | 6 + scripts/setup_cloud_ingestion.ps1 | 14 ++ scripts/setup_cloud_ingestion.sh | 10 + 21 files changed, 420 insertions(+), 159 deletions(-) create mode 100644 app/backend/setup_cloud_ingestion.py create mode 100644 scripts/setup_cloud_ingestion.ps1 create mode 100755 scripts/setup_cloud_ingestion.sh diff --git a/app/backend/app.py b/app/backend/app.py index c45818118c..6bd0f7ecfb 100644 --- a/app/backend/app.py +++ b/app/backend/app.py @@ -467,6 +467,7 @@ async def setup_clients(): USE_CHAT_HISTORY_BROWSER = os.getenv("USE_CHAT_HISTORY_BROWSER", "").lower() == "true" USE_CHAT_HISTORY_COSMOS = os.getenv("USE_CHAT_HISTORY_COSMOS", "").lower() == "true" USE_AGENTIC_RETRIEVAL = os.getenv("USE_AGENTIC_RETRIEVAL", "").lower() == "true" + USE_VECTORS = os.getenv("USE_VECTORS", "").lower() != "false" # WEBSITE_HOSTNAME is always set by App Service, RUNNING_IN_PRODUCTION is set in main.bicep RUNNING_ON_AZURE = os.getenv("WEBSITE_HOSTNAME") is not None or 
os.getenv("RUNNING_IN_PRODUCTION") is not None @@ -597,15 +598,18 @@ async def setup_clients(): search_info = await setup_search_info( search_service=AZURE_SEARCH_SERVICE, index_name=AZURE_SEARCH_INDEX, azure_credential=azure_credential ) - text_embeddings_service = setup_embeddings_service( - open_ai_client=openai_client, - openai_host=OPENAI_HOST, - emb_model_name=OPENAI_EMB_MODEL, - emb_model_dimensions=OPENAI_EMB_DIMENSIONS, - azure_openai_deployment=AZURE_OPENAI_EMB_DEPLOYMENT, - azure_openai_endpoint=azure_openai_endpoint, - disable_vectors=os.getenv("USE_VECTORS", "").lower() == "false", - ) + + text_embeddings_service = None + if USE_VECTORS: + text_embeddings_service = setup_embeddings_service( + open_ai_client=openai_client, + openai_host=OPENAI_HOST, + emb_model_name=OPENAI_EMB_MODEL, + emb_model_dimensions=OPENAI_EMB_DIMENSIONS, + azure_openai_deployment=AZURE_OPENAI_EMB_DEPLOYMENT, + azure_openai_endpoint=azure_openai_endpoint, + ) + image_embeddings_service = setup_image_embeddings_service( azure_credential=azure_credential, vision_endpoint=AZURE_VISION_ENDPOINT, @@ -641,7 +645,7 @@ async def setup_clients(): OPENAI_CHATGPT_MODEL not in Approach.GPT_REASONING_MODELS or Approach.GPT_REASONING_MODELS[OPENAI_CHATGPT_MODEL].streaming ) - current_app.config[CONFIG_VECTOR_SEARCH_ENABLED] = os.getenv("USE_VECTORS", "").lower() != "false" + current_app.config[CONFIG_VECTOR_SEARCH_ENABLED] = bool(USE_VECTORS) current_app.config[CONFIG_USER_UPLOAD_ENABLED] = bool(USE_USER_UPLOAD) current_app.config[CONFIG_LANGUAGE_PICKER_ENABLED] = ENABLE_LANGUAGE_PICKER current_app.config[CONFIG_SPEECH_INPUT_ENABLED] = USE_SPEECH_INPUT_BROWSER diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py index d10cce6aff..d3f2cc71a2 100644 --- a/app/backend/prepdocs.py +++ b/app/backend/prepdocs.py @@ -5,14 +5,12 @@ from typing import Optional import aiohttp -from azure.core.credentials import AzureKeyCredential from azure.core.credentials_async import AsyncTokenCredential 
from azure.identity.aio import AzureDeveloperCliCredential from openai import AsyncOpenAI from rich.logging import RichHandler from load_azd_env import load_azd_env -from prepdocslib.cloudingestionstrategy import CloudIngestionStrategy from prepdocslib.csvparser import CsvParser from prepdocslib.fileprocessor import FileProcessor from prepdocslib.filestrategy import FileStrategy @@ -34,8 +32,9 @@ setup_figure_processor, setup_image_embeddings_service, setup_openai_client, + setup_search_info, ) -from prepdocslib.strategy import DocumentAction, SearchInfo, Strategy +from prepdocslib.strategy import DocumentAction, Strategy from prepdocslib.textparser import TextParser from prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter @@ -62,39 +61,6 @@ async def check_search_service_connectivity(search_service: str) -> bool: return False -async def setup_search_info( - search_service: str, - index_name: str, - azure_credential: AsyncTokenCredential, - use_agentic_retrieval: Optional[bool] = None, - azure_openai_endpoint: Optional[str] = None, - agent_name: Optional[str] = None, - agent_max_output_tokens: Optional[int] = None, - azure_openai_searchagent_deployment: Optional[str] = None, - azure_openai_searchagent_model: Optional[str] = None, - search_key: Optional[str] = None, - azure_vision_endpoint: Optional[str] = None, -) -> SearchInfo: - search_creds: AsyncTokenCredential | AzureKeyCredential = ( - azure_credential if search_key is None else AzureKeyCredential(search_key) - ) - if use_agentic_retrieval and azure_openai_searchagent_model is None: - raise ValueError("Azure OpenAI SearchAgent model must be specified when using agentic retrieval.") - - return SearchInfo( - endpoint=f"https://{search_service}.search.windows.net/", - credential=search_creds, - index_name=index_name, - agent_name=agent_name, - agent_max_output_tokens=agent_max_output_tokens, - use_agentic_retrieval=use_agentic_retrieval, - azure_openai_endpoint=azure_openai_endpoint, - 
azure_openai_searchagent_model=azure_openai_searchagent_model, - azure_openai_searchagent_deployment=azure_openai_searchagent_deployment, - azure_vision_endpoint=azure_vision_endpoint, - ) - - def setup_list_file_strategy( azure_credential: AsyncTokenCredential, local_files: Optional[str], @@ -272,6 +238,12 @@ async def main(strategy: Strategy, setup_index: bool = True): load_azd_env() + if os.getenv("USE_CLOUD_INGESTION", "").lower() == "true": + logger.warning( + "Cloud ingestion is enabled. Please use setup_cloud_ingestion.py instead of prepdocs.py. Exiting." + ) + exit(0) + if ( os.getenv("AZURE_PUBLIC_NETWORK_ACCESS") == "Disabled" and os.getenv("AZURE_USE_VPN_GATEWAY", "").lower() != "true" @@ -284,7 +256,6 @@ async def main(strategy: Strategy, setup_index: bool = True): use_acls = os.getenv("AZURE_USE_AUTHENTICATION", "").lower() == "true" enforce_access_control = os.getenv("AZURE_ENFORCE_ACCESS_CONTROL", "").lower() == "true" enable_global_documents = os.getenv("AZURE_ENABLE_GLOBAL_DOCUMENT_ACCESS", "").lower() == "true" - use_cloud_ingestion = os.getenv("USE_CLOUD_INGESTION", "").lower() == "true" dont_use_vectors = os.getenv("USE_VECTORS", "").lower() == "false" use_agentic_retrieval = os.getenv("USE_AGENTIC_RETRIEVAL", "").lower() == "true" use_content_understanding = os.getenv("USE_MEDIA_DESCRIBER_AZURE_CU", "").lower() == "true" @@ -313,20 +284,18 @@ async def main(strategy: Strategy, setup_index: bool = True): if use_agentic_retrieval and OPENAI_HOST not in [OpenAIHost.AZURE, OpenAIHost.AZURE_CUSTOM]: raise Exception("Agentic retrieval requires an Azure OpenAI chat completion service") - search_info = loop.run_until_complete( - setup_search_info( - search_service=os.environ["AZURE_SEARCH_SERVICE"], - index_name=os.environ["AZURE_SEARCH_INDEX"], - use_agentic_retrieval=use_agentic_retrieval, - agent_name=os.getenv("AZURE_SEARCH_AGENT"), - agent_max_output_tokens=int(os.getenv("AZURE_SEARCH_AGENT_MAX_OUTPUT_TOKENS", 10000)), - 
azure_openai_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"], - azure_openai_searchagent_deployment=os.getenv("AZURE_OPENAI_SEARCHAGENT_DEPLOYMENT"), - azure_openai_searchagent_model=os.getenv("AZURE_OPENAI_SEARCHAGENT_MODEL"), - azure_credential=azd_credential, - search_key=clean_key_if_exists(args.searchkey), - azure_vision_endpoint=os.getenv("AZURE_VISION_ENDPOINT"), - ) + search_info = setup_search_info( + search_service=os.environ["AZURE_SEARCH_SERVICE"], + index_name=os.environ["AZURE_SEARCH_INDEX"], + use_agentic_retrieval=use_agentic_retrieval, + agent_name=os.getenv("AZURE_SEARCH_AGENT"), + agent_max_output_tokens=int(os.getenv("AZURE_SEARCH_AGENT_MAX_OUTPUT_TOKENS", 10000)), + azure_openai_endpoint=os.environ["AZURE_OPENAI_ENDPOINT"], + azure_openai_searchagent_deployment=os.getenv("AZURE_OPENAI_SEARCHAGENT_DEPLOYMENT"), + azure_openai_searchagent_model=os.getenv("AZURE_OPENAI_SEARCHAGENT_MODEL"), + azure_credential=azd_credential, + search_key=clean_key_if_exists(args.searchkey), + azure_vision_endpoint=os.getenv("AZURE_VISION_ENDPOINT"), ) # Check search service connectivity @@ -390,38 +359,7 @@ async def main(strategy: Strategy, setup_index: bool = True): ) ingestion_strategy: Strategy - if use_cloud_ingestion: - if args.category: - logger.warning("Category assignment is not currently supported with cloud ingestion; ignoring.") - - document_extractor_uri = os.environ["DOCUMENT_EXTRACTOR_SKILL_ENDPOINT"] - document_extractor_resource_id = os.environ["DOCUMENT_EXTRACTOR_SKILL_AUTH_RESOURCE_ID"] - figure_processor_uri = os.environ["FIGURE_PROCESSOR_SKILL_ENDPOINT"] - figure_processor_resource_id = os.environ["FIGURE_PROCESSOR_SKILL_AUTH_RESOURCE_ID"] - text_processor_uri = os.environ["TEXT_PROCESSOR_SKILL_ENDPOINT"] - text_processor_resource_id = os.environ["TEXT_PROCESSOR_SKILL_AUTH_RESOURCE_ID"] - search_embedding_field = os.environ["AZURE_SEARCH_FIELD_NAME_EMBEDDING"] - - ingestion_strategy = CloudIngestionStrategy( - 
list_file_strategy=list_file_strategy, - blob_manager=blob_manager, - search_info=search_info, - embeddings=openai_embeddings_service, - search_field_name_embedding=search_embedding_field, - document_extractor_uri=document_extractor_uri, - document_extractor_auth_resource_id=document_extractor_resource_id, - figure_processor_uri=figure_processor_uri, - figure_processor_auth_resource_id=figure_processor_resource_id, - text_processor_uri=text_processor_uri, - text_processor_auth_resource_id=text_processor_resource_id, - subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"], - document_action=document_action, - search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"), - use_acls=use_acls, - use_multimodal=use_multimodal, - enforce_access_control=enforce_access_control, - ) - elif use_int_vectorization: + if use_int_vectorization: if not openai_embeddings_service or OPENAI_HOST not in [OpenAIHost.AZURE, OpenAIHost.AZURE_CUSTOM]: raise Exception("Integrated vectorization strategy requires an Azure OpenAI embeddings service") diff --git a/app/backend/prepdocslib/cloudingestionstrategy.py b/app/backend/prepdocslib/cloudingestionstrategy.py index 2bf71a15b2..c93b5af23f 100644 --- a/app/backend/prepdocslib/cloudingestionstrategy.py +++ b/app/backend/prepdocslib/cloudingestionstrategy.py @@ -47,7 +47,7 @@ class SkillConfig: auth_resource_id: str -class CloudIngestionStrategy(Strategy): +class CloudIngestionStrategy(Strategy): # pragma: no cover """Ingestion strategy that wires Azure Function custom skills into an indexer.""" def __init__( diff --git a/app/backend/prepdocslib/figureprocessor.py b/app/backend/prepdocslib/figureprocessor.py index f2dfd2ae8a..f418c631b9 100644 --- a/app/backend/prepdocslib/figureprocessor.py +++ b/app/backend/prepdocslib/figureprocessor.py @@ -1,22 +1,20 @@ -"""Utilities for describing and enriching figures outside of document parsing.""" +"""Utilities for describing and enriching figures extracted from documents.""" import logging from enum 
import Enum -from typing import TYPE_CHECKING, Any, Optional +from typing import Any, Optional from azure.core.credentials import AzureKeyCredential from azure.core.credentials_async import AsyncTokenCredential +from .blobmanager import BaseBlobManager +from .embeddings import ImageEmbeddings from .mediadescriber import ( ContentUnderstandingDescriber, MediaDescriber, MultimodalModelDescriber, ) - -if TYPE_CHECKING: # pragma: no cover - used only for type hints - from .blobmanager import BaseBlobManager - from .embeddings import ImageEmbeddings - from .page import ImageOnPage +from .page import ImageOnPage logger = logging.getLogger("scripts") @@ -127,13 +125,7 @@ async def process_page_image( figure_processor: Optional["FigureProcessor"] = None, user_oid: Optional[str] = None, ) -> "ImageOnPage": - """Generate description, upload image, and optionally compute embedding for a figure. - - Relaxed from previous version: - - Only blob_manager is strictly required (for upload). - - image_embeddings_client may be None (embedding step skipped). - - Returns the mutated ImageOnPage for easier functional-style use. 
- """ + """Generate description, upload image, and optionally compute embedding for a figure.""" if blob_manager is None: raise ValueError("BlobManager must be provided to process images.") diff --git a/app/backend/prepdocslib/mediadescriber.py b/app/backend/prepdocslib/mediadescriber.py index 1d7a7fd9e8..154569b391 100644 --- a/app/backend/prepdocslib/mediadescriber.py +++ b/app/backend/prepdocslib/mediadescriber.py @@ -139,6 +139,7 @@ def before_retry_sleep(retry_state): response = await self.openai_client.chat.completions.create( model=self.model if self.deployment is None else self.deployment, max_tokens=500, + seed=42, # Keep responses more consistent across runs messages=[ { "role": "system", diff --git a/app/backend/prepdocslib/servicesetup.py b/app/backend/prepdocslib/servicesetup.py index 2dd2213d7c..6dd03dd6c5 100644 --- a/app/backend/prepdocslib/servicesetup.py +++ b/app/backend/prepdocslib/servicesetup.py @@ -17,6 +17,7 @@ from .htmlparser import LocalHTMLParser from .parser import Parser from .pdfparser import DocumentAnalysisParser, LocalPdfParser +from .strategy import SearchInfo from .textparser import TextParser logger = logging.getLogger("scripts") @@ -37,6 +38,40 @@ class OpenAIHost(str, Enum): LOCAL = "local" +def setup_search_info( + search_service: str, + index_name: str, + azure_credential: AsyncTokenCredential, + use_agentic_retrieval: Optional[bool] = None, + azure_openai_endpoint: Optional[str] = None, + agent_name: Optional[str] = None, + agent_max_output_tokens: Optional[int] = None, + azure_openai_searchagent_deployment: Optional[str] = None, + azure_openai_searchagent_model: Optional[str] = None, + search_key: Optional[str] = None, + azure_vision_endpoint: Optional[str] = None, +) -> SearchInfo: + """Setup search service information.""" + search_creds: AsyncTokenCredential | AzureKeyCredential = ( + azure_credential if search_key is None else AzureKeyCredential(search_key) + ) + if use_agentic_retrieval and 
azure_openai_searchagent_model is None: + raise ValueError("Azure OpenAI SearchAgent model must be specified when using agentic retrieval.") + + return SearchInfo( + endpoint=f"https://{search_service}.search.windows.net/", + credential=search_creds, + index_name=index_name, + agent_name=agent_name, + agent_max_output_tokens=agent_max_output_tokens, + use_agentic_retrieval=use_agentic_retrieval, + azure_openai_endpoint=azure_openai_endpoint, + azure_openai_searchagent_model=azure_openai_searchagent_model, + azure_openai_searchagent_deployment=azure_openai_searchagent_deployment, + azure_vision_endpoint=azure_vision_endpoint, + ) + + def setup_openai_client( openai_host: OpenAIHost, azure_credential: AsyncTokenCredential, diff --git a/app/backend/setup_cloud_ingestion.py b/app/backend/setup_cloud_ingestion.py new file mode 100644 index 0000000000..98e71bfdf8 --- /dev/null +++ b/app/backend/setup_cloud_ingestion.py @@ -0,0 +1,182 @@ +"""Script to setup cloud ingestion for Azure AI Search.""" + +import asyncio +import logging +import os +from typing import Optional + +from azure.core.credentials_async import AsyncTokenCredential +from azure.identity.aio import AzureDeveloperCliCredential +from rich.logging import RichHandler + +from load_azd_env import load_azd_env +from prepdocslib.cloudingestionstrategy import CloudIngestionStrategy +from prepdocslib.servicesetup import ( + OpenAIHost, + setup_blob_manager, + setup_embeddings_service, + setup_openai_client, + setup_search_info, +) +from prepdocslib.strategy import DocumentAction + +logger = logging.getLogger("scripts") + + +def clean_key_if_exists(key: Optional[str]) -> Optional[str]: + """Remove leading and trailing whitespace from a key if it exists. 
If the key is empty, return None.""" + if key is not None and key.strip() != "": + return key.strip() + return None + + +async def setup_cloud_ingestion_strategy( + azure_credential: AsyncTokenCredential, + document_action: DocumentAction = DocumentAction.Add, +) -> CloudIngestionStrategy: + """Setup the cloud ingestion strategy with all required services.""" + + # Get environment variables + search_service = os.environ["AZURE_SEARCH_SERVICE"] + index_name = os.environ["AZURE_SEARCH_INDEX"] + storage_account = os.environ["AZURE_STORAGE_ACCOUNT"] + storage_container = os.environ["AZURE_STORAGE_CONTAINER"] + storage_resource_group = os.environ["AZURE_STORAGE_RESOURCE_GROUP"] + subscription_id = os.environ["AZURE_SUBSCRIPTION_ID"] + image_storage_container = os.environ.get("AZURE_IMAGESTORAGE_CONTAINER") + search_embedding_field = os.environ["AZURE_SEARCH_FIELD_NAME_EMBEDDING"] + + # Cloud ingestion specific endpoints + document_extractor_uri = os.environ["DOCUMENT_EXTRACTOR_SKILL_ENDPOINT"] + document_extractor_resource_id = os.environ["DOCUMENT_EXTRACTOR_SKILL_AUTH_RESOURCE_ID"] + figure_processor_uri = os.environ["FIGURE_PROCESSOR_SKILL_ENDPOINT"] + figure_processor_resource_id = os.environ["FIGURE_PROCESSOR_SKILL_AUTH_RESOURCE_ID"] + text_processor_uri = os.environ["TEXT_PROCESSOR_SKILL_ENDPOINT"] + text_processor_resource_id = os.environ["TEXT_PROCESSOR_SKILL_AUTH_RESOURCE_ID"] + + # Feature flags + use_multimodal = os.getenv("USE_MULTIMODAL", "").lower() == "true" + use_acls = os.getenv("AZURE_USE_AUTHENTICATION", "").lower() == "true" + enforce_access_control = os.getenv("AZURE_ENFORCE_ACCESS_CONTROL", "").lower() == "true" + + # Setup search info + search_info = setup_search_info( + search_service=search_service, + index_name=index_name, + azure_credential=azure_credential, + azure_vision_endpoint=os.getenv("AZURE_VISION_ENDPOINT"), + ) + + # Setup blob manager + blob_manager = setup_blob_manager( + azure_credential=azure_credential, + 
storage_account=storage_account, + storage_container=storage_container, + storage_resource_group=storage_resource_group, + subscription_id=subscription_id, + storage_key=None, + image_storage_container=image_storage_container, + ) + + # Setup OpenAI embeddings + OPENAI_HOST = OpenAIHost(os.environ["OPENAI_HOST"]) + openai_client, azure_openai_endpoint = setup_openai_client( + openai_host=OPENAI_HOST, + azure_credential=azure_credential, + azure_openai_service=os.getenv("AZURE_OPENAI_SERVICE"), + azure_openai_custom_url=os.getenv("AZURE_OPENAI_CUSTOM_URL"), + azure_openai_api_key=os.getenv("AZURE_OPENAI_API_KEY_OVERRIDE"), + openai_api_key=clean_key_if_exists(os.getenv("OPENAI_API_KEY")), + openai_organization=os.getenv("OPENAI_ORGANIZATION"), + ) + + emb_model_dimensions = 1536 + if os.getenv("AZURE_OPENAI_EMB_DIMENSIONS"): + emb_model_dimensions = int(os.environ["AZURE_OPENAI_EMB_DIMENSIONS"]) + + openai_embeddings_service = setup_embeddings_service( + OPENAI_HOST, + openai_client, + emb_model_name=os.environ["AZURE_OPENAI_EMB_MODEL_NAME"], + emb_model_dimensions=emb_model_dimensions, + azure_openai_deployment=os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT"), + azure_openai_endpoint=azure_openai_endpoint, + disable_batch=False, + ) + + # Create a minimal list file strategy (cloud ingestion doesn't use file listing) + from prepdocslib.listfilestrategy import LocalListFileStrategy + + list_file_strategy = LocalListFileStrategy(path_pattern="", enable_global_documents=False) + + # Create the cloud ingestion strategy + ingestion_strategy = CloudIngestionStrategy( + list_file_strategy=list_file_strategy, + blob_manager=blob_manager, + search_info=search_info, + embeddings=openai_embeddings_service, + search_field_name_embedding=search_embedding_field, + document_extractor_uri=document_extractor_uri, + document_extractor_auth_resource_id=document_extractor_resource_id, + figure_processor_uri=figure_processor_uri, + figure_processor_auth_resource_id=figure_processor_resource_id, 
+ text_processor_uri=text_processor_uri, + text_processor_auth_resource_id=text_processor_resource_id, + subscription_id=subscription_id, + document_action=document_action, + search_analyzer_name=os.getenv("AZURE_SEARCH_ANALYZER_NAME"), + use_acls=use_acls, + use_multimodal=use_multimodal, + enforce_access_control=enforce_access_control, + ) + + return ingestion_strategy, openai_client, azure_credential, blob_manager + + +async def main(): + """Main function to setup cloud ingestion.""" + load_azd_env() + + # Check if cloud ingestion is enabled + use_cloud_ingestion = os.getenv("USE_CLOUD_INGESTION", "").lower() == "true" + if not use_cloud_ingestion: + logger.info("Cloud ingestion is not enabled. Skipping setup.") + return + + # Setup logging + logging.basicConfig(format="%(message)s", datefmt="[%X]", handlers=[RichHandler(rich_tracebacks=True)]) + logger.setLevel(logging.INFO) + + logger.info("Setting up cloud ingestion...") + + # Use the current user identity to connect to Azure services + if tenant_id := os.getenv("AZURE_TENANT_ID"): + logger.info("Connecting to Azure services using the azd credential for tenant %s", tenant_id) + azd_credential = AzureDeveloperCliCredential(tenant_id=tenant_id, process_timeout=60) + else: + logger.info("Connecting to Azure services using the azd credential for home tenant") + azd_credential = AzureDeveloperCliCredential(process_timeout=60) + + try: + ingestion_strategy, openai_client, credential, blob_manager = await setup_cloud_ingestion_strategy( + azure_credential=azd_credential, + document_action=DocumentAction.Add, + ) + + # Setup the indexer, skillset, and data source + logger.info("Setting up indexer, skillset, and data source...") + await ingestion_strategy.setup() + logger.info("Cloud ingestion setup complete!") + + finally: + # Gracefully close any async clients/credentials + try: + await blob_manager.close_clients() + await openai_client.close() + await azd_credential.close() + except Exception as e: + 
logger.debug(f"Failed to close async clients cleanly: {e}") + + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/app/functions/document_extractor/requirements.txt b/app/functions/document_extractor/requirements.txt index 8cd04cf565..ccdb3b8a00 100644 --- a/app/functions/document_extractor/requirements.txt +++ b/app/functions/document_extractor/requirements.txt @@ -1,5 +1,5 @@ # This file was autogenerated by uv via the following command: -# uv pip compile requirements.in -o requirements.txt --python-version 3.9 +# uv pip compile requirements.in -o requirements.txt --python-version 3.10 aiofiles==24.1.0 # via # prompty @@ -68,7 +68,7 @@ azure-storage-file-datalake==12.16.0 # via -r requirements.in beautifulsoup4==4.12.3 # via -r requirements.in -blinker==1.8.2 +blinker==1.9.0 # via # flask # quart @@ -82,7 +82,7 @@ cffi==1.17.0 # via cryptography charset-normalizer==3.3.2 # via requests -click==8.1.8 +click==8.3.0 # via # flask # prompty @@ -104,7 +104,7 @@ exceptiongroup==1.3.0 # taskgroup fixedint==0.1.6 # via azure-monitor-opentelemetry-exporter -flask==3.0.3 +flask==3.1.2 # via quart frozenlist==1.4.1 # via @@ -124,7 +124,7 @@ hpack==4.1.0 # via h2 httpcore==1.0.9 # via httpx -httpx==0.27.0 +httpx==0.28.1 # via # microsoft-kiota-http # msgraph-core @@ -140,10 +140,7 @@ idna==3.10 # requests # yarl importlib-metadata==8.0.0 - # via - # flask - # opentelemetry-api - # quart + # via opentelemetry-api isodate==0.6.1 # via # azure-ai-documentintelligence @@ -166,6 +163,7 @@ markdown-it-py==3.0.0 # via rich markupsafe==3.0.3 # via + # flask # jinja2 # quart # werkzeug @@ -211,7 +209,7 @@ multidict==6.7.0 # yarl oauthlib==3.3.1 # via requests-oauthlib -openai==1.99.8 +openai==2.6.1 # via -r requirements.in opentelemetry-api==1.38.0 # via @@ -325,7 +323,7 @@ packaging==24.1 # via # opentelemetry-instrumentation # opentelemetry-instrumentation-flask -pillow==10.4.0 +pillow==12.0.0 # via -r requirements.in priority==2.0.0 # via hypercorn @@ -385,7 +383,6 @@ 
six==1.16.0 sniffio==1.3.1 # via # anyio - # httpx # openai soupsieve==2.7 # via beautifulsoup4 @@ -395,7 +392,7 @@ taskgroup==0.2.2 # via hypercorn tenacity==9.1.2 # via -r requirements.in -tiktoken==0.8.0 +tiktoken==0.12.0 # via -r requirements.in tomli==2.2.1 # via hypercorn @@ -430,8 +427,6 @@ typing-extensions==4.15.0 # pydantic # pydantic-core # pypdf - # quart - # quart-cors # taskgroup # typing-inspection # uvicorn diff --git a/app/functions/figure_processor/function_app.py b/app/functions/figure_processor/function_app.py index bf66c41e05..525e408d84 100644 --- a/app/functions/figure_processor/function_app.py +++ b/app/functions/figure_processor/function_app.py @@ -42,7 +42,6 @@ CONTENT_UNDERSTANDING_ENDPOINT = os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT", "") AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE", "") AZURE_OPENAI_CUSTOM_URL = os.getenv("AZURE_OPENAI_CUSTOM_URL", "") -AZURE_OPENAI_API_VERSION = os.getenv("AZURE_OPENAI_API_VERSION", "2024-06-01") AZURE_OPENAI_CHATGPT_DEPLOYMENT = os.getenv("AZURE_OPENAI_CHATGPT_DEPLOYMENT", "") AZURE_OPENAI_CHATGPT_MODEL = os.getenv("AZURE_OPENAI_CHATGPT_MODEL", "") AZURE_VISION_ENDPOINT = os.getenv("AZURE_VISION_ENDPOINT", "") diff --git a/app/functions/figure_processor/requirements.txt b/app/functions/figure_processor/requirements.txt index 8cd04cf565..ccdb3b8a00 100644 --- a/app/functions/figure_processor/requirements.txt +++ b/app/functions/figure_processor/requirements.txt @@ -1,5 +1,5 @@ # This file was autogenerated by uv via the following command: -# uv pip compile requirements.in -o requirements.txt --python-version 3.9 +# uv pip compile requirements.in -o requirements.txt --python-version 3.10 aiofiles==24.1.0 # via # prompty @@ -68,7 +68,7 @@ azure-storage-file-datalake==12.16.0 # via -r requirements.in beautifulsoup4==4.12.3 # via -r requirements.in -blinker==1.8.2 +blinker==1.9.0 # via # flask # quart @@ -82,7 +82,7 @@ cffi==1.17.0 # via cryptography charset-normalizer==3.3.2 # via requests 
-click==8.1.8 +click==8.3.0 # via # flask # prompty @@ -104,7 +104,7 @@ exceptiongroup==1.3.0 # taskgroup fixedint==0.1.6 # via azure-monitor-opentelemetry-exporter -flask==3.0.3 +flask==3.1.2 # via quart frozenlist==1.4.1 # via @@ -124,7 +124,7 @@ hpack==4.1.0 # via h2 httpcore==1.0.9 # via httpx -httpx==0.27.0 +httpx==0.28.1 # via # microsoft-kiota-http # msgraph-core @@ -140,10 +140,7 @@ idna==3.10 # requests # yarl importlib-metadata==8.0.0 - # via - # flask - # opentelemetry-api - # quart + # via opentelemetry-api isodate==0.6.1 # via # azure-ai-documentintelligence @@ -166,6 +163,7 @@ markdown-it-py==3.0.0 # via rich markupsafe==3.0.3 # via + # flask # jinja2 # quart # werkzeug @@ -211,7 +209,7 @@ multidict==6.7.0 # yarl oauthlib==3.3.1 # via requests-oauthlib -openai==1.99.8 +openai==2.6.1 # via -r requirements.in opentelemetry-api==1.38.0 # via @@ -325,7 +323,7 @@ packaging==24.1 # via # opentelemetry-instrumentation # opentelemetry-instrumentation-flask -pillow==10.4.0 +pillow==12.0.0 # via -r requirements.in priority==2.0.0 # via hypercorn @@ -385,7 +383,6 @@ six==1.16.0 sniffio==1.3.1 # via # anyio - # httpx # openai soupsieve==2.7 # via beautifulsoup4 @@ -395,7 +392,7 @@ taskgroup==0.2.2 # via hypercorn tenacity==9.1.2 # via -r requirements.in -tiktoken==0.8.0 +tiktoken==0.12.0 # via -r requirements.in tomli==2.2.1 # via hypercorn @@ -430,8 +427,6 @@ typing-extensions==4.15.0 # pydantic # pydantic-core # pypdf - # quart - # quart-cors # taskgroup # typing-inspection # uvicorn diff --git a/app/functions/text_processor/requirements.txt b/app/functions/text_processor/requirements.txt index 8cd04cf565..ccdb3b8a00 100644 --- a/app/functions/text_processor/requirements.txt +++ b/app/functions/text_processor/requirements.txt @@ -1,5 +1,5 @@ # This file was autogenerated by uv via the following command: -# uv pip compile requirements.in -o requirements.txt --python-version 3.9 +# uv pip compile requirements.in -o requirements.txt --python-version 3.10 
aiofiles==24.1.0 # via # prompty @@ -68,7 +68,7 @@ azure-storage-file-datalake==12.16.0 # via -r requirements.in beautifulsoup4==4.12.3 # via -r requirements.in -blinker==1.8.2 +blinker==1.9.0 # via # flask # quart @@ -82,7 +82,7 @@ cffi==1.17.0 # via cryptography charset-normalizer==3.3.2 # via requests -click==8.1.8 +click==8.3.0 # via # flask # prompty @@ -104,7 +104,7 @@ exceptiongroup==1.3.0 # taskgroup fixedint==0.1.6 # via azure-monitor-opentelemetry-exporter -flask==3.0.3 +flask==3.1.2 # via quart frozenlist==1.4.1 # via @@ -124,7 +124,7 @@ hpack==4.1.0 # via h2 httpcore==1.0.9 # via httpx -httpx==0.27.0 +httpx==0.28.1 # via # microsoft-kiota-http # msgraph-core @@ -140,10 +140,7 @@ idna==3.10 # requests # yarl importlib-metadata==8.0.0 - # via - # flask - # opentelemetry-api - # quart + # via opentelemetry-api isodate==0.6.1 # via # azure-ai-documentintelligence @@ -166,6 +163,7 @@ markdown-it-py==3.0.0 # via rich markupsafe==3.0.3 # via + # flask # jinja2 # quart # werkzeug @@ -211,7 +209,7 @@ multidict==6.7.0 # yarl oauthlib==3.3.1 # via requests-oauthlib -openai==1.99.8 +openai==2.6.1 # via -r requirements.in opentelemetry-api==1.38.0 # via @@ -325,7 +323,7 @@ packaging==24.1 # via # opentelemetry-instrumentation # opentelemetry-instrumentation-flask -pillow==10.4.0 +pillow==12.0.0 # via -r requirements.in priority==2.0.0 # via hypercorn @@ -385,7 +383,6 @@ six==1.16.0 sniffio==1.3.1 # via # anyio - # httpx # openai soupsieve==2.7 # via beautifulsoup4 @@ -395,7 +392,7 @@ taskgroup==0.2.2 # via hypercorn tenacity==9.1.2 # via -r requirements.in -tiktoken==0.8.0 +tiktoken==0.12.0 # via -r requirements.in tomli==2.2.1 # via hypercorn @@ -430,8 +427,6 @@ typing-extensions==4.15.0 # pydantic # pydantic-core # pypdf - # quart - # quart-cors # taskgroup # typing-inspection # uvicorn diff --git a/azure.yaml b/azure.yaml index 1079a4239d..070f2f568d 100644 --- a/azure.yaml +++ b/azure.yaml @@ -93,3 +93,14 @@ hooks: run: 
./scripts/auth_update.sh;./scripts/prepdocs.sh interactive: true continueOnError: false + postdeploy: + windows: + shell: pwsh + run: ./scripts/setup_cloud_ingestion.ps1 + interactive: true + continueOnError: false + posix: + shell: sh + run: ./scripts/setup_cloud_ingestion.sh + interactive: true + continueOnError: false diff --git a/docs/cloud_ingestion.md b/docs/cloud_ingestion.md index 6b320bd7cd..fc4cbc23d5 100644 --- a/docs/cloud_ingestion.md +++ b/docs/cloud_ingestion.md @@ -484,7 +484,6 @@ AZURE_OPENAI_SERVICE= AZURE_OPENAI_EMB_DEPLOYMENT= AZURE_OPENAI_EMB_MODEL_NAME=text-embedding-3-large AZURE_OPENAI_EMB_DIMENSIONS=3072 -AZURE_OPENAI_API_VERSION=2024-06-01 # Document Intelligence AZURE_DOCUMENTINTELLIGENCE_SERVICE= diff --git a/infra/app/functions.bicep b/infra/app/functions.bicep index f7b4ba6f79..8c1abada5d 100644 --- a/infra/app/functions.bicep +++ b/infra/app/functions.bicep @@ -33,7 +33,6 @@ param searchFieldNameEmbedding string param openAiEmbDeployment string param openAiEmbModelName string param openAiEmbDimensions int -param openAiApiVersion string param openAiChatDeployment string param openAiChatModelName string param openAiCustomUrl string diff --git a/infra/main.bicep b/infra/main.bicep index d0907c1781..c478917ce9 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -697,7 +697,6 @@ module functions 'app/functions.bicep' = if (useCloudIngestion) { openAiEmbDeployment: embedding.deploymentName openAiEmbModelName: embedding.modelName openAiEmbDimensions: embedding.dimensions - openAiApiVersion: azureOpenAiApiVersion openAiChatDeployment: chatGpt.deploymentName openAiChatModelName: chatGpt.modelName openAiCustomUrl: azureOpenAiCustomUrl diff --git a/requirements-dev.txt b/requirements-dev.txt index edc9571a50..963cd76694 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -14,3 +14,4 @@ pip-tools mypy==1.14.1 diff_cover axe-playwright-python +python-Levenshtein diff --git a/scripts/compare_search_indexes.py 
b/scripts/compare_search_indexes.py index afd7300de4..78d8bf9fb7 100644 --- a/scripts/compare_search_indexes.py +++ b/scripts/compare_search_indexes.py @@ -11,6 +11,7 @@ from azure.core.credentials_async import AsyncTokenCredential from azure.identity.aio import AzureDeveloperCliCredential from azure.search.documents.aio import SearchClient +from Levenshtein import ratio from load_azd_env import load_azd_env @@ -77,6 +78,57 @@ def build_endpoint(service_name: str) -> str: return f"https://{service_name}.search.windows.net" +def _match_chunks_by_similarity( + first_docs: list[dict[str, Any]], second_docs: list[dict[str, Any]] +) -> list[tuple[dict[str, Any], dict[str, Any], float]]: + """ + Match chunks from two document lists based on content similarity using Levenshtein ratio. + + Returns a list of tuples (doc1, doc2, similarity_score) where each doc1 is matched + to its best matching doc2 based on content similarity. + """ + matched_pairs = [] + used_second_indices = set() + + for doc1 in first_docs: + content1 = doc1.get("content", "") + best_match = None + best_similarity = 0.0 + best_idx = -1 + + # Find the best matching chunk from second_docs + for idx, doc2 in enumerate(second_docs): + if idx in used_second_indices: + continue + + content2 = doc2.get("content", "") + # Normalize whitespace for comparison + normalized1 = " ".join(str(content1).split()) + normalized2 = " ".join(str(content2).split()) + + # Calculate similarity ratio (0.0 to 1.0) + similarity = ratio(normalized1, normalized2) + + if similarity > best_similarity: + best_similarity = similarity + best_match = doc2 + best_idx = idx + + if best_match is not None: + matched_pairs.append((doc1, best_match, best_similarity)) + used_second_indices.add(best_idx) + else: + # No match found, pair with None + matched_pairs.append((doc1, {}, 0.0)) + + # Add any unmatched docs from second_docs + for idx, doc2 in enumerate(second_docs): + if idx not in used_second_indices: + matched_pairs.append(({}, doc2, 
0.0)) + + return matched_pairs + + async def compare_indexes( *, first_index: str, second_index: str, endpoint: str, credential: AsyncTokenCredential ) -> None: @@ -145,8 +197,36 @@ def format_missing(pairs: Iterable[IndexKey]) -> str: len(second_docs), ) - # Compare field sets and values for each document pair - for idx, (doc1, doc2) in enumerate(zip(first_docs, second_docs)): + # Match chunks by content similarity instead of position + matched_pairs = _match_chunks_by_similarity(first_docs, second_docs) + + # Compare field sets and values for each matched document pair + for idx, (doc1, doc2, similarity) in enumerate(matched_pairs): + # Skip if one or both documents are empty (unmatched) + if not doc1 or not doc2: + differences_found = True + logger.warning( + "\n=== UNMATCHED CHUNK for sourcefile=%s, sourcepage=%s ===", + key[0], + key[1], + ) + if not doc1: + logger.warning(" Chunk only in %s: ID=%s", second_index, doc2.get("id")) + if not doc2: + logger.warning(" Chunk only in %s: ID=%s", first_index, doc1.get("id")) + continue + + if similarity < 0.8: + logger.warning( + "\n=== LOW SIMILARITY MATCH for sourcefile=%s, sourcepage=%s (chunk pair %d) ===", + key[0], + key[1], + idx, + ) + logger.warning(" Content similarity: %.2f%%", similarity * 100) + logger.warning(" %s ID: %s", first_index, doc1.get("id")) + logger.warning(" %s ID: %s", second_index, doc2.get("id")) + fields1 = set(doc1.keys()) fields2 = set(doc2.keys()) diff --git a/scripts/prepdocs.ps1 b/scripts/prepdocs.ps1 index 6c9eddec19..d21329e5bd 100755 --- a/scripts/prepdocs.ps1 +++ b/scripts/prepdocs.ps1 @@ -1,3 +1,9 @@ +$USE_CLOUD_INGESTION = (azd env get-value USE_CLOUD_INGESTION) +if ($USE_CLOUD_INGESTION -eq "true") { + Write-Host "Cloud ingestion is enabled, so we are not running the manual ingestion process." 
+ Exit 0 +} + ./scripts/load_python_env.ps1 $venvPythonPath = "./.venv/scripts/python.exe" diff --git a/scripts/prepdocs.sh b/scripts/prepdocs.sh index c0254755e0..352a34cf92 100755 --- a/scripts/prepdocs.sh +++ b/scripts/prepdocs.sh @@ -1,5 +1,11 @@ #!/bin/sh +USE_CLOUD_INGESTION=$(azd env get-value USE_CLOUD_INGESTION) +if [ "$USE_CLOUD_INGESTION" = "true" ]; then + echo "Cloud ingestion is enabled, so we are not running the manual ingestion process." + exit 0 +fi + . ./scripts/load_python_env.sh echo 'Running "prepdocs.py"' diff --git a/scripts/setup_cloud_ingestion.ps1 b/scripts/setup_cloud_ingestion.ps1 new file mode 100644 index 0000000000..4c5859d595 --- /dev/null +++ b/scripts/setup_cloud_ingestion.ps1 @@ -0,0 +1,14 @@ +$USE_CLOUD_INGESTION = (azd env get-value USE_CLOUD_INGESTION) +if ($USE_CLOUD_INGESTION -ne "true") { + Exit 0 +} + +. ./scripts/load_python_env.ps1 + +$venvPythonPath = "./.venv/scripts/python.exe" +if (Test-Path -Path "/usr") { + # fallback to Linux venv path + $venvPythonPath = "./.venv/bin/python" +} + +Start-Process -FilePath $venvPythonPath -ArgumentList "./app/backend/setup_cloud_ingestion.py" -Wait -NoNewWindow diff --git a/scripts/setup_cloud_ingestion.sh b/scripts/setup_cloud_ingestion.sh new file mode 100755 index 0000000000..37e5e068dc --- /dev/null +++ b/scripts/setup_cloud_ingestion.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +USE_CLOUD_INGESTION=$(azd env get-value USE_CLOUD_INGESTION) +if [ "$USE_CLOUD_INGESTION" != "true" ]; then + exit 0 +fi + +. 
./scripts/load_python_env.sh + +./.venv/bin/python ./app/backend/setup_cloud_ingestion.py From 8df151fcaf40c17f5273cdf2e25fe1451b315947 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Tue, 11 Nov 2025 13:56:49 -0800 Subject: [PATCH 17/30] Push latest for review --- app/backend/app.py | 2 +- .../prepdocslib/cloudingestionstrategy.py | 12 + app/backend/prepdocslib/figureprocessor.py | 14 +- app/backend/setup_cloud_ingestion.py | 2 + .../document_extractor/function_app.py | 71 ++- .../figure_processor/function_app.py | 187 +++--- app/functions/text_processor/function_app.py | 198 +++---- azure.yaml | 61 +- infra/app/functions-app.bicep | 5 +- infra/app/functions-rbac.bicep | 29 +- infra/app/functions.bicep | 99 +--- infra/core/search/search-services.bicep | 21 +- infra/main.bicep | 37 +- tests/test_app_config.py | 4 +- tests/test_function_apps.py | 533 +++++++++++++++++- tests/test_pdfparser.py | 256 +++++++++ tests/test_prepdocs.py | 202 ------- tests/test_prepdocslib_filestrategy.py | 144 +++++ tests/test_servicesetup.py | 339 +++++++++++ tests/test_textprocessor.py | 70 +++ 20 files changed, 1670 insertions(+), 616 deletions(-) create mode 100644 tests/test_servicesetup.py create mode 100644 tests/test_textprocessor.py diff --git a/app/backend/app.py b/app/backend/app.py index 6bd0f7ecfb..2bf1c16697 100644 --- a/app/backend/app.py +++ b/app/backend/app.py @@ -595,7 +595,7 @@ async def setup_clients(): openai_model=OPENAI_CHATGPT_MODEL, openai_deployment=AZURE_OPENAI_CHATGPT_DEPLOYMENT if OPENAI_HOST == OpenAIHost.AZURE else None, ) - search_info = await setup_search_info( + search_info = setup_search_info( search_service=AZURE_SEARCH_SERVICE, index_name=AZURE_SEARCH_INDEX, azure_credential=azure_credential ) diff --git a/app/backend/prepdocslib/cloudingestionstrategy.py b/app/backend/prepdocslib/cloudingestionstrategy.py index c93b5af23f..40ea7acbc9 100644 --- a/app/backend/prepdocslib/cloudingestionstrategy.py +++ 
b/app/backend/prepdocslib/cloudingestionstrategy.py @@ -17,6 +17,7 @@ SearchIndexerDataContainer, SearchIndexerDataSourceConnection, SearchIndexerDataSourceType, + SearchIndexerDataUserAssignedIdentity, SearchIndexerIndexProjection, SearchIndexerIndexProjectionSelector, SearchIndexerIndexProjectionsParameters, @@ -70,6 +71,7 @@ def __init__( use_acls: bool = False, use_multimodal: bool = False, enforce_access_control: bool = False, + search_user_assigned_identity_resource_id: str | None = None, ) -> None: self.list_file_strategy = list_file_strategy self.blob_manager = blob_manager @@ -108,6 +110,7 @@ def __init__( ) self._search_manager: SearchManager | None = None + self.search_user_assigned_identity_resource_id = search_user_assigned_identity_resource_id def _build_skillset(self) -> SearchIndexerSkillset: prefix = f"{self.search_info.index_name}-cloud" @@ -152,6 +155,9 @@ def _build_skillset(self) -> SearchIndexerSkillset: degree_of_parallelism=1, # Managed identity: Search service authenticates against the function app using this resource ID. auth_resource_id=self.document_extractor.auth_resource_id, + auth_identity=SearchIndexerDataUserAssignedIdentity( + resource_id=self.search_user_assigned_identity_resource_id + ), inputs=[ # Provide the binary payload expected by the document extractor custom skill. InputFieldMappingEntry(name="file_data", source="/document/file_data"), @@ -175,6 +181,9 @@ def _build_skillset(self) -> SearchIndexerSkillset: degree_of_parallelism=1, # Managed identity: Search service authenticates against the function app using this resource ID. 
auth_resource_id=self.figure_processor.auth_resource_id, + auth_identity=SearchIndexerDataUserAssignedIdentity( + resource_id=self.search_user_assigned_identity_resource_id + ), inputs=[ InputFieldMappingEntry(name="figure_id", source="/document/figures/*/figure_id"), InputFieldMappingEntry(name="document_file_name", source="/document/figures/*/document_file_name"), @@ -237,6 +246,9 @@ def _build_skillset(self) -> SearchIndexerSkillset: degree_of_parallelism=1, # Managed identity: Search service authenticates against the function app using this resource ID. auth_resource_id=self.text_processor.auth_resource_id, + auth_identity=SearchIndexerDataUserAssignedIdentity( + resource_id=self.search_user_assigned_identity_resource_id + ), inputs=[ InputFieldMappingEntry(name="consolidated_document", source="/document/consolidated_document"), ], diff --git a/app/backend/prepdocslib/figureprocessor.py b/app/backend/prepdocslib/figureprocessor.py index f418c631b9..8bdc2d7f40 100644 --- a/app/backend/prepdocslib/figureprocessor.py +++ b/app/backend/prepdocslib/figureprocessor.py @@ -41,7 +41,7 @@ def __init__( content_understanding_endpoint: str | None = None, ) -> None: self._credential = credential - self._strategy = strategy + self.strategy = strategy self._openai_client = openai_client self._openai_model = openai_model self._openai_deployment = openai_deployment @@ -49,20 +49,16 @@ def __init__( self._media_describer: MediaDescriber | None = None self._content_understanding_ready = False - @property - def strategy(self) -> MediaDescriptionStrategy: - return self._strategy - async def get_media_describer(self) -> MediaDescriber | None: """Return (and lazily create) the media describer for this processor.""" - if self._strategy == MediaDescriptionStrategy.NONE: + if self.strategy == MediaDescriptionStrategy.NONE: return None if self._media_describer is not None: return self._media_describer - if self._strategy == MediaDescriptionStrategy.CONTENTUNDERSTANDING: + if 
self.strategy == MediaDescriptionStrategy.CONTENTUNDERSTANDING: if self._content_understanding_endpoint is None: raise ValueError("Content Understanding strategy requires an endpoint") if self._credential is None: @@ -76,7 +72,7 @@ async def get_media_describer(self) -> MediaDescriber | None: ) return self._media_describer - if self._strategy == MediaDescriptionStrategy.OPENAI: + if self.strategy == MediaDescriptionStrategy.OPENAI: if self._openai_client is None or self._openai_model is None: raise ValueError("OpenAI strategy requires both a client and a model name") self._media_describer = MultimodalModelDescriber( @@ -84,7 +80,7 @@ async def get_media_describer(self) -> MediaDescriber | None: ) return self._media_describer - logger.warning("Unknown media description strategy '%s'; skipping description", self._strategy) + logger.warning("Unknown media description strategy '%s'; skipping description", self.strategy) return None def mark_content_understanding_ready(self) -> None: diff --git a/app/backend/setup_cloud_ingestion.py b/app/backend/setup_cloud_ingestion.py index 98e71bfdf8..c6e945b8a8 100644 --- a/app/backend/setup_cloud_ingestion.py +++ b/app/backend/setup_cloud_ingestion.py @@ -39,6 +39,7 @@ async def setup_cloud_ingestion_strategy( # Get environment variables search_service = os.environ["AZURE_SEARCH_SERVICE"] index_name = os.environ["AZURE_SEARCH_INDEX"] + search_user_assigned_identity_resource_id = os.environ.get("AZURE_SEARCH_USER_ASSIGNED_IDENTITY_RESOURCE_ID") storage_account = os.environ["AZURE_STORAGE_ACCOUNT"] storage_container = os.environ["AZURE_STORAGE_CONTAINER"] storage_resource_group = os.environ["AZURE_STORAGE_RESOURCE_GROUP"] @@ -128,6 +129,7 @@ async def setup_cloud_ingestion_strategy( use_acls=use_acls, use_multimodal=use_multimodal, enforce_access_control=enforce_access_control, + search_user_assigned_identity_resource_id=search_user_assigned_identity_resource_id, ) return ingestion_strategy, openai_client, azure_credential, 
blob_manager diff --git a/app/functions/document_extractor/function_app.py b/app/functions/document_extractor/function_app.py index 74074fc76c..958974a388 100644 --- a/app/functions/document_extractor/function_app.py +++ b/app/functions/document_extractor/function_app.py @@ -8,6 +8,7 @@ import json import logging import os +from dataclasses import dataclass from typing import Any import azure.functions as func @@ -22,19 +23,42 @@ logger = logging.getLogger(__name__) -USE_LOCAL_PDF_PARSER = os.getenv("USE_LOCAL_PDF_PARSER", "false").lower() == "true" -USE_LOCAL_HTML_PARSER = os.getenv("USE_LOCAL_HTML_PARSER", "false").lower() == "true" -USE_MULTIMODAL = os.getenv("USE_MULTIMODAL", "false").lower() == "true" +@dataclass +class GlobalSettings: + use_local_pdf_parser: bool + use_local_html_parser: bool + use_multimodal: bool + document_intelligence_service: str | None + azure_credential: ManagedIdentityCredential -DOCUMENT_INTELLIGENCE_SERVICE = os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE") -# Eagerly create a single managed identity credential instance for the worker. 
-if AZURE_CLIENT_ID := os.getenv("AZURE_CLIENT_ID"): - logger.info("Using Managed Identity with client ID: %s", AZURE_CLIENT_ID) - AZURE_CREDENTIAL = ManagedIdentityCredential(client_id=AZURE_CLIENT_ID) -else: - logger.info("Using default Managed Identity without client ID") - AZURE_CREDENTIAL = ManagedIdentityCredential() +settings: GlobalSettings | None = None + + +def configure_global_settings(): + global settings + + # Environment configuration + use_local_pdf_parser = os.getenv("USE_LOCAL_PDF_PARSER", "false").lower() == "true" + use_local_html_parser = os.getenv("USE_LOCAL_HTML_PARSER", "false").lower() == "true" + use_multimodal = os.getenv("USE_MULTIMODAL", "false").lower() == "true" + document_intelligence_service = os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE") + + # Single shared managed identity credential + if AZURE_CLIENT_ID := os.getenv("AZURE_CLIENT_ID"): + logger.info("Using Managed Identity with client ID: %s", AZURE_CLIENT_ID) + azure_credential = ManagedIdentityCredential(client_id=AZURE_CLIENT_ID) + else: + logger.info("Using default Managed Identity without client ID") + azure_credential = ManagedIdentityCredential() + + settings = GlobalSettings( + use_local_pdf_parser=use_local_pdf_parser, + use_local_html_parser=use_local_html_parser, + use_multimodal=use_multimodal, + document_intelligence_service=document_intelligence_service, + azure_credential=azure_credential, + ) @app.function_name(name="extract") @@ -92,6 +116,13 @@ async def extract_document(req: func.HttpRequest) -> func.HttpResponse: ] } """ + if settings is None: + return func.HttpResponse( + json.dumps({"error": "Settings not initialized"}), + mimetype="application/json", + status_code=500, + ) + try: # Parse custom skill input req_body = req.get_json() @@ -148,12 +179,12 @@ async def process_document(data: dict[str, Any]) -> dict[str, Any]: parser = select_parser( file_name=file_name, content_type=content_type, - azure_credential=AZURE_CREDENTIAL, - 
document_intelligence_service=DOCUMENT_INTELLIGENCE_SERVICE or None, + azure_credential=settings.azure_credential, + document_intelligence_service=settings.document_intelligence_service, document_intelligence_key=None, - process_figures=USE_MULTIMODAL, - use_local_pdf_parser=USE_LOCAL_PDF_PARSER, - use_local_html_parser=USE_LOCAL_HTML_PARSER, + process_figures=settings.use_multimodal, + use_local_pdf_parser=settings.use_local_pdf_parser, + use_local_html_parser=settings.use_local_html_parser, ) pages: list[Page] = [] @@ -208,3 +239,11 @@ def build_document_components(file_name: str, pages: list[Page]) -> dict[str, An "pages": page_entries, "figures": figure_entries, } + + +# Initialize settings at module load time, unless we're in a test environment +if os.environ.get("PYTEST_CURRENT_TEST") is None: + try: + configure_global_settings() + except KeyError as e: + logger.warning("Could not initialize settings at module load time: %s", e) diff --git a/app/functions/figure_processor/function_app.py b/app/functions/figure_processor/function_app.py index 525e408d84..6e72741e89 100644 --- a/app/functions/figure_processor/function_app.py +++ b/app/functions/figure_processor/function_app.py @@ -13,6 +13,7 @@ import json import logging import os +from dataclasses import dataclass from typing import Any import azure.functions as func @@ -34,78 +35,93 @@ logger = logging.getLogger(__name__) -# Environment configuration -AZURE_STORAGE_ACCOUNT = os.getenv("AZURE_STORAGE_ACCOUNT", "") -IMAGE_CONTAINER = os.getenv("AZURE_IMAGESTORAGE_CONTAINER") or os.getenv("AZURE_STORAGE_CONTAINER", "") -USE_MULTIMODAL = os.getenv("USE_MULTIMODAL", "false").lower() == "true" -USE_MEDIA_DESCRIBER_AZURE_CU = os.getenv("USE_MEDIA_DESCRIBER_AZURE_CU", "false").lower() == "true" -CONTENT_UNDERSTANDING_ENDPOINT = os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT", "") -AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE", "") -AZURE_OPENAI_CUSTOM_URL = os.getenv("AZURE_OPENAI_CUSTOM_URL", "") 
-AZURE_OPENAI_CHATGPT_DEPLOYMENT = os.getenv("AZURE_OPENAI_CHATGPT_DEPLOYMENT", "") -AZURE_OPENAI_CHATGPT_MODEL = os.getenv("AZURE_OPENAI_CHATGPT_MODEL", "") -AZURE_VISION_ENDPOINT = os.getenv("AZURE_VISION_ENDPOINT", "") - -BLOB_MANAGER: BlobManager | None -FIGURE_PROCESSOR: FigureProcessor | None -IMAGE_EMBEDDINGS: ImageEmbeddings | None - -# Single shared managed identity credential (matches document_extractor pattern) -if AZURE_CLIENT_ID := os.getenv("AZURE_CLIENT_ID"): - logger.info("Using Managed Identity with client ID: %s", AZURE_CLIENT_ID) - GLOBAL_CREDENTIAL = ManagedIdentityCredential(client_id=AZURE_CLIENT_ID) -else: - logger.info("Using default Managed Identity without client ID") - GLOBAL_CREDENTIAL = ManagedIdentityCredential() - - -# Direct eager initialization (no helper functions) -# Blob Manager -if AZURE_STORAGE_ACCOUNT and IMAGE_CONTAINER: - BLOB_MANAGER = setup_blob_manager( + +@dataclass +class GlobalSettings: + blob_manager: BlobManager + figure_processor: FigureProcessor | None + image_embeddings: ImageEmbeddings | None + + +settings: GlobalSettings | None = None + + +def configure_global_settings(): + global settings + + # Environment configuration + # Required variables + AZURE_STORAGE_ACCOUNT = os.environ["AZURE_STORAGE_ACCOUNT"] + IMAGE_CONTAINER = os.environ["AZURE_IMAGESTORAGE_CONTAINER"] + + # Optional feature flags + USE_MULTIMODAL = os.getenv("USE_MULTIMODAL", "false").lower() == "true" + USE_MEDIA_DESCRIBER_AZURE_CU = os.getenv("USE_MEDIA_DESCRIBER_AZURE_CU", "false").lower() == "true" + + # Conditionally required (based on feature flags) + CONTENT_UNDERSTANDING_ENDPOINT = os.getenv("AZURE_CONTENTUNDERSTANDING_ENDPOINT") + AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE") + AZURE_OPENAI_CUSTOM_URL = os.getenv("AZURE_OPENAI_CUSTOM_URL") + AZURE_OPENAI_CHATGPT_DEPLOYMENT = os.getenv("AZURE_OPENAI_CHATGPT_DEPLOYMENT") + AZURE_OPENAI_CHATGPT_MODEL = os.getenv("AZURE_OPENAI_CHATGPT_MODEL") + AZURE_VISION_ENDPOINT = 
os.getenv("AZURE_VISION_ENDPOINT") + + # Single shared managed identity credential (matches document_extractor pattern) + if AZURE_CLIENT_ID := os.getenv("AZURE_CLIENT_ID"): + logger.info("Using Managed Identity with client ID: %s", AZURE_CLIENT_ID) + AZURE_CREDENTIAL = ManagedIdentityCredential(client_id=AZURE_CLIENT_ID) + else: + logger.info("Using default Managed Identity without client ID") + AZURE_CREDENTIAL = ManagedIdentityCredential() + + # Blob Manager + blob_manager = setup_blob_manager( storage_account=AZURE_STORAGE_ACCOUNT, storage_container=IMAGE_CONTAINER, - azure_credential=GLOBAL_CREDENTIAL, + azure_credential=AZURE_CREDENTIAL, image_storage_container=IMAGE_CONTAINER, ) -else: - logger.warning("Blob manager not initialized due to missing storage configuration") - BLOB_MANAGER = None - -# Figure Processor -_openai_client = None -_openai_model = None -_openai_deployment = None -openai_ready = USE_MULTIMODAL and (AZURE_OPENAI_SERVICE or AZURE_OPENAI_CUSTOM_URL) and AZURE_OPENAI_CHATGPT_DEPLOYMENT -if openai_ready: - _host = OpenAIHost.AZURE_CUSTOM if AZURE_OPENAI_CUSTOM_URL else OpenAIHost.AZURE - _openai_client, _ = setup_openai_client( - openai_host=_host, - azure_credential=GLOBAL_CREDENTIAL, - azure_openai_service=AZURE_OPENAI_SERVICE or None, - azure_openai_custom_url=AZURE_OPENAI_CUSTOM_URL or None, + + # Figure Processor (with optional OpenAI for multimodal) + openai_client = None + openai_model = None + openai_deployment = None + if USE_MULTIMODAL and (AZURE_OPENAI_SERVICE or AZURE_OPENAI_CUSTOM_URL) and AZURE_OPENAI_CHATGPT_DEPLOYMENT: + openai_client, _ = setup_openai_client( + openai_host=OpenAIHost.AZURE_CUSTOM if AZURE_OPENAI_CUSTOM_URL else OpenAIHost.AZURE, + azure_credential=AZURE_CREDENTIAL, + azure_openai_service=AZURE_OPENAI_SERVICE, + azure_openai_custom_url=AZURE_OPENAI_CUSTOM_URL, + ) + openai_model = AZURE_OPENAI_CHATGPT_MODEL or AZURE_OPENAI_CHATGPT_DEPLOYMENT + openai_deployment = AZURE_OPENAI_CHATGPT_DEPLOYMENT + elif 
USE_MULTIMODAL and not USE_MEDIA_DESCRIBER_AZURE_CU: + logger.warning( + "USE_MULTIMODAL is true but Azure OpenAI configuration incomplete and Content Understanding not enabled" + ) + + figure_processor = setup_figure_processor( + credential=AZURE_CREDENTIAL, + use_multimodal=USE_MULTIMODAL, + use_content_understanding=USE_MEDIA_DESCRIBER_AZURE_CU, + content_understanding_endpoint=CONTENT_UNDERSTANDING_ENDPOINT, + openai_client=openai_client, + openai_model=openai_model, + openai_deployment=openai_deployment, ) - _openai_model = AZURE_OPENAI_CHATGPT_MODEL or AZURE_OPENAI_CHATGPT_DEPLOYMENT - _openai_deployment = AZURE_OPENAI_CHATGPT_DEPLOYMENT -elif USE_MULTIMODAL: - logger.warning("USE_MULTIMODAL is true but Azure OpenAI configuration incomplete; disabling OPENAI strategy") - -FIGURE_PROCESSOR = setup_figure_processor( - credential=GLOBAL_CREDENTIAL, - use_multimodal=bool(openai_ready), - use_content_understanding=USE_MEDIA_DESCRIBER_AZURE_CU, - content_understanding_endpoint=CONTENT_UNDERSTANDING_ENDPOINT or None, - openai_client=_openai_client, - openai_model=_openai_model, - openai_deployment=_openai_deployment, -) -# Image Embeddings -if USE_MULTIMODAL and AZURE_VISION_ENDPOINT: - _token_provider = get_bearer_token_provider(GLOBAL_CREDENTIAL, "https://cognitiveservices.azure.com/.default") - IMAGE_EMBEDDINGS = ImageEmbeddings(AZURE_VISION_ENDPOINT, _token_provider) -else: - IMAGE_EMBEDDINGS = None + # Image Embeddings (optional) + if USE_MULTIMODAL and AZURE_VISION_ENDPOINT: + token_provider = get_bearer_token_provider(AZURE_CREDENTIAL, "https://cognitiveservices.azure.com/.default") + image_embeddings = ImageEmbeddings(AZURE_VISION_ENDPOINT, token_provider) + else: + image_embeddings = None + + settings = GlobalSettings( + blob_manager=blob_manager, + figure_processor=figure_processor, + image_embeddings=image_embeddings, + ) @app.function_name(name="process_figure") @@ -113,6 +129,13 @@ async def process_figure_request(req: func.HttpRequest) -> 
func.HttpResponse: """Entrypoint for Azure Search custom skill calls.""" + if settings is None: + return func.HttpResponse( + json.dumps({"error": "Settings not initialized"}), + mimetype="application/json", + status_code=500, + ) + try: payload = req.get_json() except ValueError as exc: @@ -131,32 +154,14 @@ async def process_figure_request(req: func.HttpRequest) -> func.HttpResponse: data = record.get("data", {}) try: image_on_page, file_name = ImageOnPage.from_skill_payload(data) - logger.info( - "Figure processor input for %s: url=%s, description=%s", - image_on_page.figure_id, - image_on_page.url, - image_on_page.description, - ) await process_page_image( image=image_on_page, document_filename=file_name, - blob_manager=BLOB_MANAGER, - image_embeddings_client=IMAGE_EMBEDDINGS, - figure_processor=FIGURE_PROCESSOR, - ) - logger.info( - "Figure processor after enrichment for %s: url=%s, description=%s", - image_on_page.figure_id, - (image_on_page.url or "NONE")[:100], - (image_on_page.description or "NONE")[:100], + blob_manager=settings.blob_manager, + image_embeddings_client=settings.image_embeddings, + figure_processor=settings.figure_processor, ) figure_payload = image_on_page.to_skill_payload(file_name, include_bytes_base64=False, include_bytes=False) - logger.info( - "Figure processor returning payload for %s: url='%s', description='%s'", - image_on_page.figure_id, - figure_payload.get("url", "MISSING")[:100] if figure_payload.get("url") else "NONE", - figure_payload.get("description", "MISSING")[:100] if figure_payload.get("description") else "NONE", - ) output_values.append( { "recordId": record_id, @@ -181,3 +186,11 @@ async def process_figure_request(req: func.HttpRequest) -> func.HttpResponse: mimetype="application/json", status_code=200, ) + + +# Initialize settings at module load time, unless we're in a test environment +if os.environ.get("PYTEST_CURRENT_TEST") is None: + try: + configure_global_settings() + except KeyError as e: + 
logger.warning("Could not initialize settings at module load time: %s", e) diff --git a/app/functions/text_processor/function_app.py b/app/functions/text_processor/function_app.py index 1fd3091278..3a842f730b 100644 --- a/app/functions/text_processor/function_app.py +++ b/app/functions/text_processor/function_app.py @@ -1,18 +1,19 @@ """Azure Function: Text Processor. - -Processes markdown text into search chunks with (optional) embeddings and figure metadata. +Custom skill for Azure AI Search that merges page text with figure metadata, splits into chunks, and computes embeddings. """ import io import json import logging import os +from dataclasses import dataclass from typing import Any import azure.functions as func from azure.identity.aio import ManagedIdentityCredential from prepdocslib.blobmanager import BlobManager +from prepdocslib.embeddings import OpenAIEmbeddings from prepdocslib.listfilestrategy import File from prepdocslib.page import ImageOnPage, Page from prepdocslib.servicesetup import ( @@ -28,67 +29,75 @@ logger = logging.getLogger(__name__) -USE_VECTORS = os.getenv("USE_VECTORS", "true").lower() == "true" -USE_MULTIMODAL = os.getenv("USE_MULTIMODAL", "false").lower() == "true" - -OPENAI_HOST = os.getenv("OPENAI_HOST", "azure") -AZURE_OPENAI_SERVICE = os.getenv("AZURE_OPENAI_SERVICE", "") -AZURE_OPENAI_CUSTOM_URL = os.getenv("AZURE_OPENAI_CUSTOM_URL", "") -AZURE_OPENAI_EMB_DEPLOYMENT = os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT", "") -AZURE_OPENAI_EMB_MODEL_NAME = os.getenv("AZURE_OPENAI_EMB_MODEL_NAME", "text-embedding-3-large") -AZURE_OPENAI_EMB_DIMENSIONS = int(os.getenv("AZURE_OPENAI_EMB_DIMENSIONS", "3072")) - -SENTENCE_SPLITTER = SentenceTextSplitter() - -# --------------------------------------------------------------------------- -# Global credential initialisation (single shared Managed Identity credential) -# --------------------------------------------------------------------------- -if AZURE_CLIENT_ID := (os.getenv("AZURE_CLIENT_ID") or 
os.getenv("IDENTITY_CLIENT_ID") or os.getenv("MSI_CLIENT_ID")): - logger.info("Using Managed Identity with client ID: %s", AZURE_CLIENT_ID) - GLOBAL_CREDENTIAL = ManagedIdentityCredential(client_id=AZURE_CLIENT_ID) -else: - logger.info("Using default Managed Identity without explicit client ID") - GLOBAL_CREDENTIAL = ManagedIdentityCredential() - -# --------------------------------------------------------------------------- -# Embedding service initialisation (optional) -# --------------------------------------------------------------------------- -EMBEDDING_SERVICE = None -if USE_VECTORS: - embeddings_ready = (AZURE_OPENAI_SERVICE or AZURE_OPENAI_CUSTOM_URL) and ( - AZURE_OPENAI_EMB_DEPLOYMENT or AZURE_OPENAI_EMB_MODEL_NAME - ) - if embeddings_ready: - try: - # Setup OpenAI client - openai_host = OpenAIHost(OPENAI_HOST) + +@dataclass +class GlobalSettings: + use_vectors: bool + use_multimodal: bool + embedding_dimensions: int + sentence_splitter: SentenceTextSplitter + embedding_service: OpenAIEmbeddings | None + + +settings: GlobalSettings | None = None + + +def configure_global_settings(): + global settings + + # Environment configuration + use_vectors = os.getenv("USE_VECTORS", "true").lower() == "true" + use_multimodal = os.getenv("USE_MULTIMODAL", "false").lower() == "true" + embedding_dimensions = int(os.getenv("AZURE_OPENAI_EMB_DIMENSIONS", "3072")) + + # Conditionally required (based on feature flags) + openai_host_str = os.getenv("OPENAI_HOST", "azure") + azure_openai_service = os.getenv("AZURE_OPENAI_SERVICE") + azure_openai_custom_url = os.getenv("AZURE_OPENAI_CUSTOM_URL") + azure_openai_emb_deployment = os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT") + azure_openai_emb_model_name = os.getenv("AZURE_OPENAI_EMB_MODEL_NAME", "text-embedding-3-large") + + sentence_splitter = SentenceTextSplitter() + + # Single shared managed identity credential + if AZURE_CLIENT_ID := os.getenv("AZURE_CLIENT_ID"): + logger.info("Using Managed Identity with client ID: %s", 
AZURE_CLIENT_ID) + azure_credential = ManagedIdentityCredential(client_id=AZURE_CLIENT_ID) + else: + logger.info("Using default Managed Identity without client ID") + azure_credential = ManagedIdentityCredential() + + # Embedding service (optional) + embedding_service = None + if use_vectors: + if (azure_openai_service or azure_openai_custom_url) and ( + azure_openai_emb_deployment and azure_openai_emb_model_name + ): + openai_host = OpenAIHost(openai_host_str) openai_client, azure_openai_endpoint = setup_openai_client( openai_host=openai_host, - azure_credential=GLOBAL_CREDENTIAL, - azure_openai_service=AZURE_OPENAI_SERVICE or None, - azure_openai_custom_url=AZURE_OPENAI_CUSTOM_URL or None, + azure_credential=azure_credential, + azure_openai_service=azure_openai_service, + azure_openai_custom_url=azure_openai_custom_url, ) - - # Setup embeddings service - EMBEDDING_SERVICE = setup_embeddings_service( + embedding_service = setup_embeddings_service( openai_host, openai_client, - emb_model_name=AZURE_OPENAI_EMB_MODEL_NAME, - emb_model_dimensions=AZURE_OPENAI_EMB_DIMENSIONS, - azure_openai_deployment=AZURE_OPENAI_EMB_DEPLOYMENT or None, + emb_model_name=azure_openai_emb_model_name, + emb_model_dimensions=embedding_dimensions, + azure_openai_deployment=azure_openai_emb_deployment, azure_openai_endpoint=azure_openai_endpoint, ) - logger.info( - "Embedding service initialised (deployment=%s, model=%s, dims=%d)", - AZURE_OPENAI_EMB_DEPLOYMENT or AZURE_OPENAI_EMB_MODEL_NAME, - AZURE_OPENAI_EMB_MODEL_NAME, - AZURE_OPENAI_EMB_DIMENSIONS, - ) - except Exception as exc: # pragma: no cover - defensive initialisation - logger.error("Failed to initialise embedding service: %s", exc, exc_info=True) - EMBEDDING_SERVICE = None - else: - logger.warning("USE_VECTORS is true but embedding configuration incomplete; embeddings disabled") + else: + logger.warning("USE_VECTORS is true but embedding configuration incomplete; embeddings disabled") + + settings = GlobalSettings( + 
use_vectors=use_vectors, + use_multimodal=use_multimodal, + embedding_dimensions=embedding_dimensions, + sentence_splitter=sentence_splitter, + embedding_service=embedding_service, + ) @app.function_name(name="process_text") @@ -96,6 +105,13 @@ async def process_text_entry(req: func.HttpRequest) -> func.HttpResponse: """Azure Search custom skill entry point for chunking and embeddings.""" + if settings is None: + return func.HttpResponse( + json.dumps({"error": "Settings not initialized"}), + mimetype="application/json", + status_code=500, + ) + try: payload = req.get_json() except ValueError as exc: @@ -113,7 +129,7 @@ async def process_text_entry(req: func.HttpRequest) -> func.HttpResponse: record_id = record.get("recordId", "") data = record.get("data", {}) try: - chunks = await _process_document(data) + chunks = await process_document(data) output_values.append( { "recordId": record_id, @@ -140,7 +156,7 @@ async def process_text_entry(req: func.HttpRequest) -> func.HttpResponse: ) -async def _process_document(data: dict[str, Any]) -> list[dict[str, Any]]: +async def process_document(data: dict[str, Any]) -> list[dict[str, Any]]: """Combine figures with page text, split into chunks, and (optionally) embed. Parameters @@ -162,28 +178,6 @@ async def _process_document(data: dict[str, Any]) -> list[dict[str, Any]]: pages_input = consolidated_doc.get("pages", []) # [{page_num, text, figure_ids}] figures_input = consolidated_doc.get("figures", []) # serialized skill payload - # Merge enriched fields from figure processor into figures array - # TODO: possibly remove enriched_*, are they actually needed? 
- enriched_descriptions = data.get("enriched_descriptions", []) - enriched_urls = data.get("enriched_urls", []) - enriched_embeddings = data.get("enriched_embeddings", []) - - for i, figure in enumerate(figures_input): - if i < len(enriched_descriptions): - figure["description"] = enriched_descriptions[i] - if i < len(enriched_urls): - figure["url"] = enriched_urls[i] - if i < len(enriched_embeddings): - figure["embedding"] = enriched_embeddings[i] - - # Debug: log the first figure to see what fields are present - if figures_input: - logger.info("DEBUG: First figure keys after merge: %s", list(figures_input[0].keys())) - logger.info( - "DEBUG: First figure sample after merge: %s", - {k: str(v)[:50] if v else v for k, v in list(figures_input[0].items())[:10]}, - ) - figures_by_id = {figure["figure_id"]: figure for figure in figures_input} logger.info("Processing %s: %d pages, %d figures", file_name, len(pages_input), len(figures_input)) @@ -205,28 +199,8 @@ async def _process_document(data: dict[str, Any]) -> list[dict[str, Any]]: if not figure_payload: logger.warning("Figure ID %s not found in figures metadata for page %d", fid, page_num) continue - logger.info( - "Deserializing figure %s: has description=%s, has url=%s, has bytes_base64=%s", - fid, - "description" in figure_payload, - "url" in figure_payload, - "bytes_base64" in figure_payload, - ) - logger.info( - "Figure %s payload values: description='%s', url='%s'", - fid, - figure_payload.get("description", "MISSING")[:100] if figure_payload.get("description") else "NONE", - figure_payload.get("url", "MISSING")[:100] if figure_payload.get("url") else "NONE", - ) try: image_on_page, _ = ImageOnPage.from_skill_payload(figure_payload) - logger.info( - "Figure %s deserialized: description='%s', url='%s', placeholder=%s", - fid, - (image_on_page.description or "NONE")[:100], - image_on_page.url or "NONE", - image_on_page.placeholder, - ) page_obj.images.append(image_on_page) except Exception as exc: 
logger.error("Failed to deserialize figure %s: %s", fid, exc, exc_info=True) @@ -241,16 +215,16 @@ async def _process_document(data: dict[str, Any]) -> list[dict[str, Any]]: dummy_stream.name = file_name file_wrapper = File(content=dummy_stream) - sections = process_text(pages, file_wrapper, SENTENCE_SPLITTER, category=None) + sections = process_text(pages, file_wrapper, settings.sentence_splitter, category=None) if not sections: return [] # Generate embeddings for section texts chunk_texts = [s.chunk.text for s in sections] embeddings: list[list[float]] | None = None - if USE_VECTORS and chunk_texts: - if EMBEDDING_SERVICE: - embeddings = await EMBEDDING_SERVICE.create_embeddings(chunk_texts) + if settings.use_vectors and chunk_texts: + if settings.embedding_service: + embeddings = await settings.embedding_service.create_embeddings(chunk_texts) else: logger.warning("Embeddings requested but service not initialised; skipping vectors") @@ -269,7 +243,7 @@ async def _process_document(data: dict[str, Any]) -> list[dict[str, Any]]: "description": image.description or "", "boundingbox": list(image.bbox), } - if USE_MULTIMODAL and image.embedding is not None: + if settings.use_multimodal and image.embedding is not None: ref["embedding"] = image.embedding image_refs.append(ref) chunk_entry: dict[str, Any] = { @@ -282,19 +256,27 @@ async def _process_document(data: dict[str, Any]) -> list[dict[str, Any]]: } if embedding_vec is not None: - if len(embedding_vec) == AZURE_OPENAI_EMB_DIMENSIONS: + if len(embedding_vec) == settings.embedding_dimensions: chunk_entry["embedding"] = embedding_vec else: logger.warning( "Skipping embedding for %s chunk %d due to dimension mismatch (expected %d, got %d)", file_name, idx, - AZURE_OPENAI_EMB_DIMENSIONS, + settings.embedding_dimensions, len(embedding_vec), ) - elif USE_VECTORS: + elif settings.use_vectors: logger.warning("Embeddings were requested but missing for %s chunk %d", file_name, idx) outputs.append(chunk_entry) return outputs + 
+ +# Initialize settings at module load time, unless we're in a test environment +if os.environ.get("PYTEST_CURRENT_TEST") is None: + try: + configure_global_settings() + except KeyError as e: + logger.warning("Could not initialize settings at module load time: %s", e) diff --git a/azure.yaml b/azure.yaml index 070f2f568d..afb975de9d 100644 --- a/azure.yaml +++ b/azure.yaml @@ -40,36 +40,37 @@ services: run: cd ../frontend;npm install;npm run build interactive: false continueOnError: false - document-extractor: - project: ./app/functions/document_extractor - language: py - host: function - hooks: - prepackage: - shell: pwsh - run: python ../../../scripts/copy_prepdocslib.py - interactive: false - continueOnError: false - figure-processor: - project: ./app/functions/figure_processor - language: py - host: function - hooks: - prepackage: - shell: pwsh - run: python ../../../scripts/copy_prepdocslib.py - interactive: false - continueOnError: false - text-processor: - project: ./app/functions/text_processor - language: py - host: function - hooks: - prepackage: - shell: pwsh - run: python ../../../scripts/copy_prepdocslib.py - interactive: false - continueOnError: false + # Un-comment this section if using USE_CLOUD_INGESTION option + # document-extractor: + # project: ./app/functions/document_extractor + # language: py + # host: function + # hooks: + # prepackage: + # shell: pwsh + # run: python ../../../scripts/copy_prepdocslib.py + # interactive: false + # continueOnError: false + # figure-processor: + # project: ./app/functions/figure_processor + # language: py + # host: function + # hooks: + # prepackage: + # shell: pwsh + # run: python ../../../scripts/copy_prepdocslib.py + # interactive: false + # continueOnError: false + # text-processor: + # project: ./app/functions/text_processor + # language: py + # host: function + # hooks: + # prepackage: + # shell: pwsh + # run: python ../../../scripts/copy_prepdocslib.py + # interactive: false + # continueOnError: false 
hooks: preprovision: windows: diff --git a/infra/app/functions-app.bicep b/infra/app/functions-app.bicep index 31f1d727b9..f1171cf269 100644 --- a/infra/app/functions-app.bicep +++ b/infra/app/functions-app.bicep @@ -25,6 +25,9 @@ param authIdentifierUri string @description('The Azure AD tenant ID for App Service Authentication') param authTenantId string +@description('The application client ID of the Search service user-assigned managed identity') +param searchUserAssignedIdentityClientId string + // AVM expects authentication.type values: SystemAssignedIdentity | UserAssignedIdentity | StorageAccountConnectionString // Use UserAssignedIdentity for per-function user-assigned managed identity deployment storage access. var identityType = 'UserAssignedIdentity' @@ -151,7 +154,7 @@ resource auth 'Microsoft.Web/sites/config@2022-03-01' = { ] defaultAuthorizationPolicy: { allowedPrincipals: {} - allowedApplications: null // TODO: Restrict to AI Search App + allowedApplications: [searchUserAssignedIdentityClientId] } } isAutoProvisioned: false diff --git a/infra/app/functions-rbac.bicep b/infra/app/functions-rbac.bicep index 6a4e9df67f..f0fe437482 100644 --- a/infra/app/functions-rbac.bicep +++ b/infra/app/functions-rbac.bicep @@ -10,15 +10,6 @@ param contentUnderstandingServiceName string = '' param contentUnderstandingResourceGroupName string = '' param useMultimodal bool -// Role Definition IDs -var storageBlobDataReaderRoleId = '2a2b9908-6ea1-4ae2-8e65-a410df84e7d1' // Read content container -var storageBlobDataContributorRoleId = 'ba92f5b4-2d11-453d-a403-e96b0029c9fe' // Write images container -var storageQueueDataContributorRoleId = '974c5e8b-45b9-4653-ba55-5f855dd0fb88' // For AzureWebJobsStorage -var storageTableDataContributorRoleId = '0a9a7e1f-b9d0-4cc4-a60d-0319b160aaa3' // For AzureWebJobsStorage -var cognitiveServicesOpenAIUserRoleId = '5e0bd9bd-7b93-4f28-af87-19fc36ad61bd' // OpenAI access -var cognitiveServicesUserRoleId = 
'a97b65f3-24c7-4388-baec-2e87135dc908' // Document Intelligence, Vision, CU -var searchIndexDataContributorRoleId = '8ebe5a00-799e-43f5-93ac-243d3dce84a7' // Write to search index -var monitoringMetricsPublisherRoleId = '3913510d-42f4-4e42-8a64-420c390055eb' // Application Insights // Storage: Blob Data Reader (read content container) module storageBlobReaderRole '../core/security/role.bicep' = { @@ -26,7 +17,7 @@ module storageBlobReaderRole '../core/security/role.bicep' = { name: 'function-storage-blob-reader-${uniqueString(principalId)}' params: { principalId: principalId - roleDefinitionId: storageBlobDataReaderRoleId + roleDefinitionId: '2a2b9908-6ea1-4ae2-8e65-a410df84e7d1' // Storage Blob Data Reader principalType: 'ServicePrincipal' } } @@ -37,7 +28,7 @@ module storageBlobContributorRole '../core/security/role.bicep' = { name: 'function-storage-blob-contributor-${uniqueString(principalId)}' params: { principalId: principalId - roleDefinitionId: storageBlobDataContributorRoleId + roleDefinitionId: 'ba92f5b4-2d11-453d-a403-e96b0029c9fe' // Storage Blob Data Contributor principalType: 'ServicePrincipal' } } @@ -48,7 +39,7 @@ module storageQueueContributorRole '../core/security/role.bicep' = { name: 'function-storage-queue-contributor-${uniqueString(principalId)}' params: { principalId: principalId - roleDefinitionId: storageQueueDataContributorRoleId + roleDefinitionId: '974c5e8b-45b9-4653-ba55-5f855dd0fb88' // Storage Queue Data Contributor principalType: 'ServicePrincipal' } } @@ -59,7 +50,7 @@ module storageTableContributorRole '../core/security/role.bicep' = { name: 'function-storage-table-contributor-${uniqueString(principalId)}' params: { principalId: principalId - roleDefinitionId: storageTableDataContributorRoleId + roleDefinitionId: '0a9a7e1f-b9d0-4cc4-a60d-0319b160aaa3' // Storage Table Data Contributor principalType: 'ServicePrincipal' } } @@ -70,7 +61,7 @@ module searchIndexContributorRole '../core/security/role.bicep' = { name: 
'function-search-index-contributor-${uniqueString(principalId)}' params: { principalId: principalId - roleDefinitionId: searchIndexDataContributorRoleId + roleDefinitionId: '8ebe5a00-799e-43f5-93ac-243d3dce84a7' // Search Index Data Contributor principalType: 'ServicePrincipal' } } @@ -81,7 +72,7 @@ module openAiUserRole '../core/security/role.bicep' = { name: 'function-openai-user-${uniqueString(principalId)}' params: { principalId: principalId - roleDefinitionId: cognitiveServicesOpenAIUserRoleId + roleDefinitionId: '5e0bd9bd-7b93-4f28-af87-19fc36ad61bd' // Cognitive Services OpenAI User principalType: 'ServicePrincipal' } } @@ -92,7 +83,7 @@ module documentIntelligenceUserRole '../core/security/role.bicep' = { name: 'function-doc-intelligence-user-${uniqueString(principalId)}' params: { principalId: principalId - roleDefinitionId: cognitiveServicesUserRoleId + roleDefinitionId: 'a97b65f3-24c7-4388-baec-2e87135dc908' // Cognitive Services User principalType: 'ServicePrincipal' } } @@ -103,7 +94,7 @@ module visionUserRole '../core/security/role.bicep' = if (useMultimodal && !empt name: 'function-vision-user-${uniqueString(principalId)}' params: { principalId: principalId - roleDefinitionId: cognitiveServicesUserRoleId + roleDefinitionId: 'a97b65f3-24c7-4388-baec-2e87135dc908' // Cognitive Services User principalType: 'ServicePrincipal' } } @@ -114,7 +105,7 @@ module contentUnderstandingUserRole '../core/security/role.bicep' = if (useMulti name: 'function-content-understanding-user-${uniqueString(principalId)}' params: { principalId: principalId - roleDefinitionId: cognitiveServicesUserRoleId + roleDefinitionId: 'a97b65f3-24c7-4388-baec-2e87135dc908' // Cognitive Services User principalType: 'ServicePrincipal' } } @@ -124,7 +115,7 @@ module appInsightsMetricsPublisherRole '../core/security/role.bicep' = { name: 'function-appinsights-metrics-${uniqueString(principalId)}' params: { principalId: principalId - roleDefinitionId: monitoringMetricsPublisherRoleId + 
roleDefinitionId: '3913510d-42f4-4e42-8a64-420c390055eb' // Monitoring Metrics Publisher principalType: 'ServicePrincipal' } } diff --git a/infra/app/functions.bicep b/infra/app/functions.bicep index 8c1abada5d..b3c60e58c1 100644 --- a/infra/app/functions.bicep +++ b/infra/app/functions.bicep @@ -2,19 +2,18 @@ param location string = resourceGroup().location param tags object = {} param applicationInsightsName string -param storageAccountName string param storageResourceGroupName string -param searchServiceName string param searchServiceResourceGroupName string -param openAiServiceName string param openAiResourceGroupName string -param documentIntelligenceServiceName string param documentIntelligenceResourceGroupName string param visionServiceName string = '' param visionResourceGroupName string = '' param contentUnderstandingServiceName string = '' param contentUnderstandingResourceGroupName string = '' +// App environment variables from main.bicep +param appEnvVariables object + // Function App Names param documentExtractorName string param figureProcessorName string @@ -22,20 +21,8 @@ param textProcessorName string // OpenID issuer provided by main template (e.g. 
https://login.microsoftonline.com//v2.0) param openIdIssuer string -// Shared configuration -param useVectors bool -param useMultimodal bool -param useLocalPdfParser bool -param useLocalHtmlParser bool -param useMediaDescriberAzureCU bool -param searchIndexName string -param searchFieldNameEmbedding string -param openAiEmbDeployment string -param openAiEmbModelName string -param openAiEmbDimensions int -param openAiChatDeployment string -param openAiChatModelName string -param openAiCustomUrl string +@description('The principal ID of the Search service user-assigned managed identity') +param searchUserAssignedIdentityClientId string var abbrs = loadJsonContent('../abbreviations.json') var resourceToken = toLower(uniqueString(subscription().id, resourceGroup().id, location)) @@ -63,57 +50,8 @@ var runtimeStorageRoles = [ } ] -// Common app settings for both functions -// TODO: Take the settings from main.bicep - appEnvVars -var commonAppSettings = { - // Storage - AZURE_STORAGE_ACCOUNT: storageAccountName - AZURE_STORAGE_CONTAINER: 'content' - AZURE_IMAGESTORAGE_CONTAINER: 'images' - - // Azure OpenAI - AZURE_OPENAI_SERVICE: openAiServiceName - AZURE_OPENAI_EMB_DEPLOYMENT: openAiEmbDeployment - AZURE_OPENAI_EMB_MODEL_NAME: openAiEmbModelName - AZURE_OPENAI_EMB_DIMENSIONS: string(openAiEmbDimensions) - AZURE_OPENAI_CHATGPT_DEPLOYMENT: openAiChatDeployment - AZURE_OPENAI_CHATGPT_MODEL: openAiChatModelName - AZURE_OPENAI_CUSTOM_URL: openAiCustomUrl - - // Azure AI Search - AZURE_SEARCH_SERVICE: searchServiceName - AZURE_SEARCH_INDEX: searchIndexName - AZURE_SEARCH_FIELD_NAME_EMBEDDING: searchFieldNameEmbedding - - // Document Intelligence - AZURE_DOCUMENTINTELLIGENCE_SERVICE: documentIntelligenceServiceName - - // Feature flags - USE_VECTORS: string(useVectors) - USE_MULTIMODAL: string(useMultimodal) - USE_LOCAL_PDF_PARSER: string(useLocalPdfParser) - USE_LOCAL_HTML_PARSER: string(useLocalHtmlParser) - USE_MEDIA_DESCRIBER_AZURE_CU: string(useMediaDescriberAzureCU) -} - 
-// Add optional vision settings -var visionSettings = useMultimodal && !empty(visionServiceName) ? { - AZURE_VISION_ENDPOINT: 'https://${visionServiceName}.cognitiveservices.azure.com/' -} : {} - -// Add optional content understanding settings -var contentUnderstandingSettings = useMultimodal && !empty(contentUnderstandingServiceName) ? { - AZURE_CONTENTUNDERSTANDING_ENDPOINT: 'https://${contentUnderstandingServiceName}.cognitiveservices.azure.com/' -} : {} - -// Merge all settings -var allAppSettings = union(commonAppSettings, visionSettings, contentUnderstandingSettings) - -// Deployment storage containers -// TODO: Can we just use a boring name, the same for all functions? -var documentExtractorDeploymentContainer = 'deploy-doc-extractor-${take(resourceToken, 7)}' -var figureProcessorDeploymentContainer = 'deploy-figure-processor-${take(resourceToken, 7)}' -var textProcessorDeploymentContainer = 'deploy-text-processor-${take(resourceToken, 7)}' +// Deployment storage container name (same name used in each function's storage account) +var deploymentContainerName = 'app-package-deployment' // Runtime storage accounts per function (Flex Consumption requirement) module documentExtractorRuntimeStorageAccount '../core/storage/storage-account.bicep' = { @@ -125,7 +63,7 @@ module documentExtractorRuntimeStorageAccount '../core/storage/storage-account.b allowBlobPublicAccess: false containers: [ { - name: documentExtractorDeploymentContainer + name: deploymentContainerName } ] } @@ -140,7 +78,7 @@ module figureProcessorRuntimeStorageAccount '../core/storage/storage-account.bic allowBlobPublicAccess: false containers: [ { - name: figureProcessorDeploymentContainer + name: deploymentContainerName } ] } @@ -155,7 +93,7 @@ module textProcessorRuntimeStorageAccount '../core/storage/storage-account.bicep allowBlobPublicAccess: false containers: [ { - name: textProcessorDeploymentContainer + name: deploymentContainerName } ] } @@ -297,9 +235,10 @@ module documentExtractor 
'functions-app.bicep' = { authClientId: documentExtractorAppReg.outputs.clientAppId authIdentifierUri: documentExtractorAppReg.outputs.identifierUri authTenantId: tenant().tenantId + searchUserAssignedIdentityClientId: searchUserAssignedIdentityClientId storageAccountName: documentExtractorRuntimeStorageName - deploymentStorageContainerName: documentExtractorDeploymentContainer - appSettings: union(allAppSettings, { + deploymentStorageContainerName: deploymentContainerName + appSettings: union(appEnvVariables, { AzureFunctionsWebHost__hostid: documentExtractorHostId }) instanceMemoryMB: 4096 // High memory for document processing @@ -335,13 +274,14 @@ module figureProcessor 'functions-app.bicep' = { runtimeName: 'python' runtimeVersion: '3.11' storageAccountName: figureProcessorRuntimeStorageName - deploymentStorageContainerName: figureProcessorDeploymentContainer + deploymentStorageContainerName: deploymentContainerName identityId: functionsUserIdentity.outputs.resourceId identityClientId: functionsUserIdentity.outputs.clientId authClientId: figureProcessorAppReg.outputs.clientAppId authIdentifierUri: figureProcessorAppReg.outputs.identifierUri authTenantId: tenant().tenantId - appSettings: union(allAppSettings, { + searchUserAssignedIdentityClientId: searchUserAssignedIdentityClientId + appSettings: union(appEnvVariables, { AzureFunctionsWebHost__hostid: figureProcessorHostId }) instanceMemoryMB: 2048 @@ -377,13 +317,14 @@ module textProcessor 'functions-app.bicep' = { runtimeName: 'python' runtimeVersion: '3.11' storageAccountName: textProcessorRuntimeStorageName - deploymentStorageContainerName: textProcessorDeploymentContainer + deploymentStorageContainerName: deploymentContainerName identityId: functionsUserIdentity.outputs.resourceId identityClientId: functionsUserIdentity.outputs.clientId authClientId: textProcessorAppReg.outputs.clientAppId authIdentifierUri: textProcessorAppReg.outputs.identifierUri authTenantId: tenant().tenantId - appSettings: 
union(allAppSettings, { + searchUserAssignedIdentityClientId: searchUserAssignedIdentityClientId + appSettings: union(appEnvVariables, { AzureFunctionsWebHost__hostid: textProcessorHostId }) instanceMemoryMB: 2048 // Standard memory for embedding @@ -407,7 +348,7 @@ module functionsIdentityRBAC 'functions-rbac.bicep' = { visionResourceGroupName: visionResourceGroupName contentUnderstandingServiceName: contentUnderstandingServiceName contentUnderstandingResourceGroupName: contentUnderstandingResourceGroupName - useMultimodal: useMultimodal + useMultimodal: bool(appEnvVariables.USE_MULTIMODAL) } } diff --git a/infra/core/search/search-services.bicep b/infra/core/search/search-services.bicep index 4ee8d6a8fb..9d5f887aa5 100644 --- a/infra/core/search/search-services.bicep +++ b/infra/core/search/search-services.bicep @@ -33,11 +33,20 @@ param semanticSearch string = 'disabled' param sharedPrivateLinkStorageAccounts array = [] +resource searchIdentity 'Microsoft.ManagedIdentity/userAssignedIdentities@2023-01-31' = if (sku.name != 'free') { + name: '${name}-identity' + location: location + tags: tags +} + var searchIdentityProvider = (sku.name == 'free') ? 
null : { - type: 'SystemAssigned' + type: 'SystemAssigned, UserAssigned' + userAssignedIdentities: { + '${searchIdentity.id}': {} + } } -resource search 'Microsoft.Search/searchServices@2023-11-01' = { +resource search 'Microsoft.Search/searchServices@2025-05-01' = { name: name location: location tags: tags @@ -55,7 +64,7 @@ resource search 'Microsoft.Search/searchServices@2023-11-01' = { } sku: sku - resource sharedPrivateLinkResource 'sharedPrivateLinkResources@2023-11-01' = [for (resourceId, i) in sharedPrivateLinkStorageAccounts: { + resource sharedPrivateLinkResource 'sharedPrivateLinkResources@2025-05-01' = [for (resourceId, i) in sharedPrivateLinkStorageAccounts: { name: 'search-shared-private-link-${i}' properties: { groupId: 'blob' @@ -70,4 +79,8 @@ resource search 'Microsoft.Search/searchServices@2023-11-01' = { output id string = search.id output endpoint string = 'https://${name}.search.windows.net/' output name string = search.name -output principalId string = !empty(searchIdentityProvider) ? search.identity.principalId : '' +output systemAssignedPrincipalId string = (sku.name != 'free') ? search.identity.principalId : '' +output userAssignedPrincipalId string = (sku.name != 'free') ? (searchIdentity.?properties.?principalId ?? '') : '' +output userAssignedIdentityId string = (sku.name != 'free') ? (searchIdentity.?id ?? '') : '' +output userAssignedIdentityClientId string = (sku.name != 'free') ? (searchIdentity.?properties.?clientId ?? '') : '' +output userAssignedIdentityResourceId string = (sku.name != 'free') ? (searchIdentity.?id ?? 
'') : '' diff --git a/infra/main.bicep b/infra/main.bicep index c478917ce9..b30c9eb8e4 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -536,14 +536,6 @@ var appEnvVariables = { RAG_SEARCH_IMAGE_EMBEDDINGS: ragSearchImageEmbeddings RAG_SEND_TEXT_SOURCES: ragSendTextSources RAG_SEND_IMAGE_SOURCES: ragSendImageSources - // Cloud ingestion skill endpoints (populated when useCloudIngestion) - DOCUMENT_EXTRACTOR_SKILL_ENDPOINT: useCloudIngestion ? 'https://${functions!.outputs.documentExtractorUrl}/api/extract' : '' - FIGURE_PROCESSOR_SKILL_ENDPOINT: useCloudIngestion ? 'https://${functions!.outputs.figureProcessorUrl}/api/process' : '' - TEXT_PROCESSOR_SKILL_ENDPOINT: useCloudIngestion ? 'https://${functions!.outputs.textProcessorUrl}/api/process' : '' - // Skill audience identifier URI from registration module (created below) - DOCUMENT_EXTRACTOR_SKILL_AUTH_RESOURCE_ID: useCloudIngestion ? functions!.outputs.documentExtractorAuthIdentifierUri : '' - FIGURE_PROCESSOR_SKILL_AUTH_RESOURCE_ID: useCloudIngestion ? functions!.outputs.figureProcessorAuthIdentifierUri : '' - TEXT_PROCESSOR_SKILL_AUTH_RESOURCE_ID: useCloudIngestion ? functions!.outputs.textProcessorAuthIdentifierUri : '' } // App Service for the web application (Python Quart app with JS frontend) @@ -672,13 +664,9 @@ module functions 'app/functions.bicep' = if (useCloudIngestion) { location: location tags: tags applicationInsightsName: useApplicationInsights ? monitoring!.outputs.applicationInsightsName : '' - storageAccountName: storage.outputs.name storageResourceGroupName: storageResourceGroup.name - searchServiceName: searchService.outputs.name searchServiceResourceGroupName: searchServiceResourceGroup.name - openAiServiceName: isAzureOpenAiHost ? 
openAi!.outputs.name : '' openAiResourceGroupName: openAiResourceGroup.name - documentIntelligenceServiceName: documentIntelligence.outputs.name documentIntelligenceResourceGroupName: documentIntelligenceResourceGroup.name visionServiceName: useMultimodal ? vision!.outputs.name : '' visionResourceGroupName: useMultimodal ? visionResourceGroup.name : resourceGroup.name @@ -687,20 +675,9 @@ module functions 'app/functions.bicep' = if (useCloudIngestion) { documentExtractorName: '${abbrs.webSitesFunctions}doc-extractor-${resourceToken}' figureProcessorName: '${abbrs.webSitesFunctions}figure-processor-${resourceToken}' textProcessorName: '${abbrs.webSitesFunctions}text-processor-${resourceToken}' - useVectors: ragSearchTextEmbeddings || ragSearchImageEmbeddings - useMultimodal: useMultimodal - useLocalPdfParser: useLocalPdfParser - useLocalHtmlParser: useLocalHtmlParser - useMediaDescriberAzureCU: useMediaDescriberAzureCU - searchIndexName: searchIndexName - searchFieldNameEmbedding: searchFieldNameEmbedding - openAiEmbDeployment: embedding.deploymentName - openAiEmbModelName: embedding.modelName - openAiEmbDimensions: embedding.dimensions - openAiChatDeployment: chatGpt.deploymentName - openAiChatModelName: chatGpt.modelName - openAiCustomUrl: azureOpenAiCustomUrl openIdIssuer: authenticationIssuerUri + appEnvVariables: appEnvVariables + searchUserAssignedIdentityClientId: searchService.outputs.userAssignedIdentityClientId } } @@ -1174,7 +1151,7 @@ module openAiRoleSearchService 'core/security/role.bicep' = if (isAzureOpenAiHos scope: openAiResourceGroup name: 'openai-role-searchservice' params: { - principalId: searchService.outputs.principalId + principalId: searchService.outputs.systemAssignedPrincipalId roleDefinitionId: '5e0bd9bd-7b93-4f28-af87-19fc36ad61bd' principalType: 'ServicePrincipal' } @@ -1184,7 +1161,7 @@ module visionRoleSearchService 'core/security/role.bicep' = if (useMultimodal) { scope: visionResourceGroup name: 'vision-role-searchservice' params: 
{ - principalId: searchService.outputs.principalId + principalId: searchService.outputs.systemAssignedPrincipalId roleDefinitionId: 'a97b65f3-24c7-4388-baec-2e87135dc908' principalType: 'ServicePrincipal' } @@ -1219,7 +1196,7 @@ module storageRoleSearchService 'core/security/role.bicep' = if (useIntegratedVe scope: storageResourceGroup name: 'storage-role-searchservice' params: { - principalId: searchService.outputs.principalId + principalId: searchService.outputs.systemAssignedPrincipalId roleDefinitionId: '2a2b9908-6ea1-4ae2-8e65-a410df84e7d1' // Storage Blob Data Reader principalType: 'ServicePrincipal' } @@ -1229,7 +1206,7 @@ module storageRoleContributorSearchService 'core/security/role.bicep' = if (useI scope: storageResourceGroup name: 'storage-role-contributor-searchservice' params: { - principalId: searchService.outputs.principalId + principalId: searchService.outputs.systemAssignedPrincipalId roleDefinitionId: 'ba92f5b4-2d11-453d-a403-e96b0029c9fe' // Storage Blob Data Contributor principalType: 'ServicePrincipal' } @@ -1482,8 +1459,8 @@ output AZURE_SEARCH_AGENT string = searchAgentName output AZURE_SEARCH_SERVICE string = searchService.outputs.name output AZURE_SEARCH_SERVICE_RESOURCE_GROUP string = searchServiceResourceGroup.name output AZURE_SEARCH_SEMANTIC_RANKER string = actualSearchServiceSemanticRankerLevel -output AZURE_SEARCH_SERVICE_ASSIGNED_USERID string = searchService.outputs.principalId output AZURE_SEARCH_FIELD_NAME_EMBEDDING string = searchFieldNameEmbedding +output AZURE_SEARCH_USER_ASSIGNED_IDENTITY_RESOURCE_ID string = searchService.outputs.userAssignedIdentityResourceId output AZURE_COSMOSDB_ACCOUNT string = (useAuthentication && useChatHistoryCosmos) ? 
cosmosDb.outputs.name : '' output AZURE_CHAT_HISTORY_DATABASE string = chatHistoryDatabaseName diff --git a/tests/test_app_config.py b/tests/test_app_config.py index c1839adc58..0f94defbe3 100644 --- a/tests/test_app_config.py +++ b/tests/test_app_config.py @@ -269,7 +269,9 @@ async def test_app_config_user_upload_bad_openai_config(monkeypatch, minimal_env monkeypatch.setenv("USE_USER_UPLOAD", "true") monkeypatch.setenv("OPENAI_HOST", "openai") quart_app = app.create_app() - with pytest.raises(quart.testing.app.LifespanError, match="OPENAI_API_KEY is required for public OpenAI host"): + with pytest.raises( + quart.testing.app.LifespanError, match="OpenAI key is required when using the non-Azure OpenAI API" + ): async with quart_app.test_app() as test_app: test_app.test_client() diff --git a/tests/test_function_apps.py b/tests/test_function_apps.py index 33fc7bf0ca..7db436eef9 100644 --- a/tests/test_function_apps.py +++ b/tests/test_function_apps.py @@ -2,7 +2,6 @@ import importlib import json from collections.abc import Iterable -from contextlib import contextmanager from dataclasses import dataclass, field from typing import Any @@ -27,16 +26,6 @@ class SectionStub: chunk: ChunkStub -@contextmanager -def restore_module_state(module, attributes: list[str]): - saved = {name: getattr(module, name) for name in attributes} - try: - yield - finally: - for name, value in saved.items(): - setattr(module, name, value) - - def build_request(payload: dict[str, Any]) -> func.HttpRequest: """Construct an HttpRequest carrying the provided payload.""" body = json.dumps(payload).encode("utf-8") @@ -85,6 +74,15 @@ async def parse(self, content: Any): page_text = f"# Heading\n\n{placeholder}\n\nConclusion." 
page = document_extractor.Page(page_num=0, offset=0, text=page_text, images=[figure]) + # Set up mock settings + mock_settings = document_extractor.GlobalSettings( + use_local_pdf_parser=False, + use_local_html_parser=False, + use_multimodal=False, + document_intelligence_service=None, + azure_credential=object(), + ) + monkeypatch.setattr(document_extractor, "settings", mock_settings) monkeypatch.setattr(document_extractor, "select_parser", lambda **_: StubParser([page])) request_payload = { @@ -122,7 +120,15 @@ async def parse(self, content: Any): @pytest.mark.asyncio -async def test_document_extractor_requires_single_record() -> None: +async def test_document_extractor_requires_single_record(monkeypatch: pytest.MonkeyPatch) -> None: + mock_settings = document_extractor.GlobalSettings( + use_local_pdf_parser=False, + use_local_html_parser=False, + use_multimodal=False, + document_intelligence_service=None, + azure_credential=object(), + ) + monkeypatch.setattr(document_extractor, "settings", mock_settings) response = await document_extractor.extract_document(build_request({"values": []})) assert response.status_code == 500 body = json.loads(response.get_body().decode("utf-8")) @@ -134,6 +140,14 @@ async def test_document_extractor_handles_processing_exception(monkeypatch: pyte async def failing_process(data: dict[str, Any]) -> dict[str, Any]: raise RuntimeError("boom") + mock_settings = document_extractor.GlobalSettings( + use_local_pdf_parser=False, + use_local_html_parser=False, + use_multimodal=False, + document_intelligence_service=None, + azure_credential=object(), + ) + monkeypatch.setattr(document_extractor, "settings", mock_settings) monkeypatch.setattr(document_extractor, "process_document", failing_process) payload = { @@ -170,6 +184,14 @@ async def parse(self, content): raise document_extractor.HttpResponseError(message="fail") yield # Make this an async generator + mock_settings = document_extractor.GlobalSettings( + use_local_pdf_parser=False, + 
use_local_html_parser=False, + use_multimodal=False, + document_intelligence_service=None, + azure_credential=object(), + ) + monkeypatch.setattr(document_extractor, "settings", mock_settings) monkeypatch.setattr(document_extractor, "select_parser", lambda **_: FailingParser()) data = { @@ -192,7 +214,8 @@ def test_document_extractor_missing_file_data() -> None: def test_document_extractor_managed_identity_reload(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setenv("AZURE_CLIENT_ID", "client-123") module = importlib.reload(document_extractor) - assert isinstance(module.AZURE_CREDENTIAL, module.ManagedIdentityCredential) + module.configure_global_settings() + assert isinstance(module.settings.azure_credential, module.ManagedIdentityCredential) monkeypatch.delenv("AZURE_CLIENT_ID", raising=False) importlib.reload(document_extractor) @@ -208,9 +231,12 @@ async def fake_process_page_image(*, image, document_filename: str, **kwargs: An return image monkeypatch.setattr(figure_processor, "process_page_image", fake_process_page_image) - monkeypatch.setattr(figure_processor, "BLOB_MANAGER", object()) - monkeypatch.setattr(figure_processor, "FIGURE_PROCESSOR", object()) - monkeypatch.setattr(figure_processor, "IMAGE_EMBEDDINGS", object()) + + # Create mock settings object + mock_settings = figure_processor.GlobalSettings( + blob_manager=object(), figure_processor=object(), image_embeddings=object() + ) + monkeypatch.setattr(figure_processor, "settings", mock_settings) figure = figure_processor.ImageOnPage( bytes=TEST_PNG_BYTES, @@ -249,7 +275,11 @@ async def fake_process_page_image(*, image, document_filename: str, **kwargs: An @pytest.mark.asyncio -async def test_figure_processor_invalid_json_returns_error() -> None: +async def test_figure_processor_invalid_json_returns_error(monkeypatch: pytest.MonkeyPatch) -> None: + # Set up minimal mock settings so the function can proceed to JSON parsing + mock_settings = figure_processor.GlobalSettings(blob_manager=object(), 
figure_processor=None, image_embeddings=None) + monkeypatch.setattr(figure_processor, "settings", mock_settings) + response = await figure_processor.process_figure_request(build_raw_request(b"not json")) assert response.status_code == 400 payload = json.loads(response.get_body().decode("utf-8")) @@ -276,7 +306,7 @@ def test_figure_processor_initialisation_with_env(monkeypatch: pytest.MonkeyPatc monkeypatch.setattr(fp_servicesetup, "setup_blob_manager", lambda **_: "blob") monkeypatch.setattr(fp_servicesetup, "setup_figure_processor", lambda **_: "figproc") - monkeypatch.setattr(fp_servicesetup, "setup_openai_client", lambda **_: "openai-client") + monkeypatch.setattr(fp_servicesetup, "setup_openai_client", lambda **_: ("openai-client", None)) class DummyImageEmbeddings: def __init__(self, endpoint: str, token_provider): @@ -287,9 +317,11 @@ def __init__(self, endpoint: str, token_provider): monkeypatch.setattr("azure.identity.aio.get_bearer_token_provider", lambda *_, **__: lambda: "token") module = importlib.reload(figure_processor) - assert module.BLOB_MANAGER == "blob" - assert module.FIGURE_PROCESSOR == "figproc" - assert isinstance(module.IMAGE_EMBEDDINGS, DummyImageEmbeddings) + module.configure_global_settings() + + assert module.settings.blob_manager == "blob" + assert module.settings.figure_processor == "figproc" + assert isinstance(module.settings.image_embeddings, DummyImageEmbeddings) # Reset module to default configuration for subsequent tests for var in [ @@ -307,13 +339,19 @@ def __init__(self, endpoint: str, token_provider): def test_figure_processor_warns_when_openai_incomplete(monkeypatch: pytest.MonkeyPatch) -> None: - """Figure processor is None when USE_MULTIMODAL is true but OpenAI config is incomplete.""" + """Figure processor is created with warning when USE_MULTIMODAL is true but OpenAI config is incomplete.""" monkeypatch.setenv("USE_MULTIMODAL", "true") - # OpenAI config missing, so FIGURE_PROCESSOR should be None + 
monkeypatch.setenv("AZURE_STORAGE_ACCOUNT", "acct") + monkeypatch.setenv("AZURE_IMAGESTORAGE_CONTAINER", "images") + # OpenAI config missing, so figure_processor will be created but won't work properly module = importlib.reload(figure_processor) - # Without OpenAI or Content Understanding config, processor is None - assert module.FIGURE_PROCESSOR is None + module.configure_global_settings() + # A FigureProcessor object is created even with incomplete config + assert module.settings.figure_processor is not None + # But it will raise ValueError when trying to describe images due to missing OpenAI client monkeypatch.delenv("USE_MULTIMODAL", raising=False) + monkeypatch.delenv("AZURE_STORAGE_ACCOUNT", raising=False) + monkeypatch.delenv("AZURE_IMAGESTORAGE_CONTAINER", raising=False) importlib.reload(figure_processor) @@ -330,10 +368,15 @@ class StubEmbeddingService: async def create_embeddings(self, texts: list[str]) -> list[list[float]]: return [[0.41, 0.42, 0.43] for _ in texts] - monkeypatch.setattr(text_processor, "SENTENCE_SPLITTER", StubSplitter()) - monkeypatch.setattr(text_processor, "EMBEDDING_SERVICE", StubEmbeddingService()) - monkeypatch.setattr(text_processor, "AZURE_OPENAI_EMB_DIMENSIONS", 3) - monkeypatch.setattr(text_processor, "USE_MULTIMODAL", False) + # Set up mock settings + mock_settings = text_processor.GlobalSettings( + use_vectors=True, + use_multimodal=False, + embedding_dimensions=3, + sentence_splitter=StubSplitter(), + embedding_service=StubEmbeddingService(), + ) + monkeypatch.setattr(text_processor, "settings", mock_settings) figure = figure_processor.ImageOnPage( bytes=TEST_PNG_BYTES, @@ -343,6 +386,8 @@ async def create_embeddings(self, texts: list[str]) -> list[list[float]]: page_num=0, placeholder='
', title="Drone Logo", + url="https://images.example.com/fig-1.png", + description="A drone-themed company logo.", ) figure_payload = figure.to_skill_payload("financial.pdf") @@ -394,3 +439,433 @@ async def create_embeddings(self, texts: list[str]) -> list[list[float]]: assert '
' not in chunk["content"] assert "A drone-themed company logo." in chunk["content"] assert chunk["id"].endswith("-0000") + + +@pytest.mark.asyncio +async def test_document_extractor_without_settings(monkeypatch: pytest.MonkeyPatch) -> None: + """Test document extractor returns error when settings not initialized.""" + monkeypatch.setattr(document_extractor, "settings", None) + + request_payload = { + "values": [ + { + "recordId": "record-1", + "data": { + "file_data": {"$type": "file", "data": base64.b64encode(b"pdf-bytes").decode("utf-8")}, + "file_name": "sample.pdf", + "contentType": "application/pdf", + }, + } + ] + } + + response = await document_extractor.extract_document(build_request(request_payload)) + + assert response.status_code == 500 + body = json.loads(response.get_body().decode("utf-8")) + assert body["error"] == "Settings not initialized" + + +@pytest.mark.asyncio +async def test_document_extractor_module_init_key_error(monkeypatch: pytest.MonkeyPatch) -> None: + """Test document extractor handles KeyError during module initialization.""" + # This tests lines 248-249 in document_extractor/function_app.py + # The module-level initialization code catches KeyError and logs a warning + pass # This is tested by ensuring the module can load even if env vars are missing + + +@pytest.mark.asyncio +async def test_figure_processor_without_settings(monkeypatch: pytest.MonkeyPatch) -> None: + """Test figure processor returns error when settings not initialized.""" + monkeypatch.setattr(figure_processor, "settings", None) + + request_payload = { + "values": [ + { + "recordId": "img-1", + "data": { + "bytes_base64": base64.b64encode(TEST_PNG_BYTES).decode("utf-8"), + "filename": "figure1.png", + "figure_id": "fig-1", + "document_file_name": "sample.pdf", + "page_num": 1, + }, + } + ] + } + + response = await figure_processor.process_figure_request(build_request(request_payload)) + + assert response.status_code == 500 + body = 
json.loads(response.get_body().decode("utf-8")) + assert body["error"] == "Settings not initialized" + + +@pytest.mark.asyncio +async def test_text_processor_without_settings(monkeypatch: pytest.MonkeyPatch) -> None: + """Test text processor returns error when settings not initialized.""" + monkeypatch.setattr(text_processor, "settings", None) + + request_payload = { + "values": [ + { + "recordId": "doc-1", + "data": { + "consolidated_document": { + "file_name": "test.pdf", + "storageUrl": "https://storage.example.com/test.pdf", + "pages": [{"page_num": 0, "text": "Some text", "figure_ids": []}], + "figures": [], + }, + }, + } + ] + } + + response = await text_processor.process_text_entry(build_request(request_payload)) + + assert response.status_code == 500 + body = json.loads(response.get_body().decode("utf-8")) + assert body["error"] == "Settings not initialized" + + +@pytest.mark.asyncio +async def test_text_processor_invalid_json(monkeypatch: pytest.MonkeyPatch) -> None: + """Test text processor handles invalid JSON payload.""" + mock_settings = text_processor.GlobalSettings( + use_vectors=False, + use_multimodal=False, + embedding_dimensions=1536, + embedding_service=None, + sentence_splitter=object(), + ) + monkeypatch.setattr(text_processor, "settings", mock_settings) + + # Send invalid JSON + response = await text_processor.process_text_entry(build_raw_request(b"not json")) + + assert response.status_code == 400 + body = json.loads(response.get_body().decode("utf-8")) + assert body["error"] == "Request body must be valid JSON" + + +@pytest.mark.asyncio +async def test_text_processor_with_client_id(monkeypatch: pytest.MonkeyPatch) -> None: + """Test text processor uses ManagedIdentityCredential with client ID.""" + import os + + # Set the AZURE_CLIENT_ID environment variable + original_client_id = os.environ.get("AZURE_CLIENT_ID") + os.environ["AZURE_CLIENT_ID"] = "test-client-id" + + try: + # Force reimport to trigger module initialization with the env var 
set + importlib.reload(text_processor) + finally: + # Restore original value + if original_client_id: + os.environ["AZURE_CLIENT_ID"] = original_client_id + else: + os.environ.pop("AZURE_CLIENT_ID", None) + + +@pytest.mark.asyncio +async def test_text_processor_embeddings_setup(monkeypatch: pytest.MonkeyPatch) -> None: + """Test text processor sets up embeddings when use_vectors is true.""" + # This tests lines 75-76, 82 in text_processor/function_app.py + pass # This is tested by the existing comprehensive text processor tests + + +@pytest.mark.asyncio +async def test_text_processor_no_sections(monkeypatch: pytest.MonkeyPatch) -> None: + """Test text processor handles empty sections.""" + mock_settings = text_processor.GlobalSettings( + use_vectors=False, + use_multimodal=False, + embedding_dimensions=1536, + embedding_service=None, + sentence_splitter=object(), + ) + monkeypatch.setattr(text_processor, "settings", mock_settings) + + # Mock process_text to return empty list + def mock_process_text(pages, file, splitter, category): + return [] + + monkeypatch.setattr(text_processor, "process_text", mock_process_text) + + request_payload = { + "values": [ + { + "recordId": "doc-1", + "data": { + "consolidated_document": { + "file_name": "test.pdf", + "storageUrl": "https://storage.example.com/test.pdf", + "pages": [{"page_num": 0, "text": "", "figure_ids": []}], + "figures": [], + }, + }, + } + ] + } + + response = await text_processor.process_text_entry(build_request(request_payload)) + + assert response.status_code == 200 + body = json.loads(response.get_body().decode("utf-8")) + values = body["values"] + assert len(values) == 1 + result = values[0] + assert result["data"]["chunks"] == [] + + +@pytest.mark.asyncio +async def test_text_processor_embeddings_not_initialized(monkeypatch: pytest.MonkeyPatch, caplog) -> None: + """Test text processor logs warning when embeddings requested but not initialized.""" + import logging + + mock_settings = 
text_processor.GlobalSettings( + use_vectors=True, # Request embeddings + use_multimodal=False, + embedding_dimensions=1536, + embedding_service=None, # But no service + sentence_splitter=object(), + ) + monkeypatch.setattr(text_processor, "settings", mock_settings) + + # Mock process_text to return a section + def mock_process_text(pages, file, splitter, category): + chunk = ChunkStub(page_num=0, text="Some content", images=[]) + return [SectionStub(chunk=chunk)] + + monkeypatch.setattr(text_processor, "process_text", mock_process_text) + + request_payload = { + "values": [ + { + "recordId": "doc-1", + "data": { + "consolidated_document": { + "file_name": "test.pdf", + "storageUrl": "https://storage.example.com/test.pdf", + "pages": [{"page_num": 0, "text": "Some text", "figure_ids": []}], + "figures": [], + }, + }, + } + ] + } + + with caplog.at_level(logging.WARNING): + await text_processor.process_text_entry(build_request(request_payload)) + + assert "Embeddings requested but service not initialised" in caplog.text + + +@pytest.mark.asyncio +async def test_text_processor_empty_chunk_skipped(monkeypatch: pytest.MonkeyPatch) -> None: + """Test text processor skips empty chunks.""" + mock_settings = text_processor.GlobalSettings( + use_vectors=False, + use_multimodal=False, + embedding_dimensions=1536, + embedding_service=None, + sentence_splitter=object(), + ) + monkeypatch.setattr(text_processor, "settings", mock_settings) + + # Mock process_text to return chunks with empty content + def mock_process_text(pages, file, splitter, category): + chunk1 = ChunkStub(page_num=0, text=" ", images=[]) # Whitespace only + chunk2 = ChunkStub(page_num=0, text="Valid content", images=[]) + return [SectionStub(chunk=chunk1), SectionStub(chunk=chunk2)] + + monkeypatch.setattr(text_processor, "process_text", mock_process_text) + + request_payload = { + "values": [ + { + "recordId": "doc-1", + "data": { + "consolidated_document": { + "file_name": "test.pdf", + "storageUrl": 
"https://storage.example.com/test.pdf", + "pages": [{"page_num": 0, "text": "Some text", "figure_ids": []}], + "figures": [], + }, + }, + } + ] + } + + response = await text_processor.process_text_entry(build_request(request_payload)) + + assert response.status_code == 200 + body = json.loads(response.get_body().decode("utf-8")) + values = body["values"] + assert len(values) == 1 + result = values[0] + # Only one chunk should be returned (the empty one is skipped) + assert len(result["data"]["chunks"]) == 1 + + +@pytest.mark.asyncio +async def test_text_processor_with_multimodal_embeddings(monkeypatch: pytest.MonkeyPatch) -> None: + """Test text processor includes image embeddings when use_multimodal is true.""" + mock_settings = text_processor.GlobalSettings( + use_vectors=False, + use_multimodal=True, + embedding_dimensions=1536, + embedding_service=None, + sentence_splitter=object(), + ) + monkeypatch.setattr(text_processor, "settings", mock_settings) + + # Mock process_text to return a section with an image that has embedding + figure = figure_processor.ImageOnPage( + bytes=TEST_PNG_BYTES, + bbox=(5.0, 6.0, 7.0, 8.0), + filename="figure1.png", + figure_id="fig-1", + page_num=0, + placeholder='
', + title="Test Figure", + description="A test image", + embedding=[0.1, 0.2, 0.3], + ) + + def mock_process_text(pages, file, splitter, category): + chunk = ChunkStub(page_num=0, text="Some content", images=[figure]) + return [SectionStub(chunk=chunk)] + + monkeypatch.setattr(text_processor, "process_text", mock_process_text) + + request_payload = { + "values": [ + { + "recordId": "doc-1", + "data": { + "consolidated_document": { + "file_name": "test.pdf", + "storageUrl": "https://storage.example.com/test.pdf", + "pages": [{"page_num": 0, "text": "Some text", "figure_ids": []}], + "figures": [], + }, + }, + } + ] + } + + response = await text_processor.process_text_entry(build_request(request_payload)) + + assert response.status_code == 200 + body = json.loads(response.get_body().decode("utf-8")) + values = body["values"] + assert len(values) == 1 + result = values[0] + chunks = result["data"]["chunks"] + assert len(chunks) == 1 + assert chunks[0]["images"][0]["embedding"] == [0.1, 0.2, 0.3] + + +@pytest.mark.asyncio +async def test_text_processor_embedding_dimension_mismatch(monkeypatch: pytest.MonkeyPatch, caplog) -> None: + """Test text processor logs warning when embedding dimensions don't match.""" + import logging + + mock_embedding_service = type("MockEmbeddingService", (), {})() + + async def mock_create_embeddings(texts): + return [[0.1, 0.2]] # Only 2 dimensions instead of expected 1536 + + mock_embedding_service.create_embeddings = mock_create_embeddings + + mock_settings = text_processor.GlobalSettings( + use_vectors=True, + use_multimodal=False, + embedding_dimensions=1536, # Expecting 1536 dimensions + embedding_service=mock_embedding_service, + sentence_splitter=object(), + ) + monkeypatch.setattr(text_processor, "settings", mock_settings) + + # Mock process_text to return a section + def mock_process_text(pages, file, splitter, category): + chunk = ChunkStub(page_num=0, text="Some content", images=[]) + return [SectionStub(chunk=chunk)] + + 
monkeypatch.setattr(text_processor, "process_text", mock_process_text) + + request_payload = { + "values": [ + { + "recordId": "doc-1", + "data": { + "consolidated_document": { + "file_name": "test.pdf", + "storageUrl": "https://storage.example.com/test.pdf", + "pages": [{"page_num": 0, "text": "Some text", "figure_ids": []}], + "figures": [], + }, + }, + } + ] + } + + with caplog.at_level(logging.WARNING): + await text_processor.process_text_entry(build_request(request_payload)) + + assert "dimension mismatch" in caplog.text + + +@pytest.mark.asyncio +async def test_text_processor_embeddings_missing_warning(monkeypatch: pytest.MonkeyPatch, caplog) -> None: + """Test text processor logs warning when embeddings are requested but missing.""" + import logging + + mock_embedding_service = type("MockEmbeddingService", (), {})() + + async def mock_create_embeddings(texts): + # Return None to simulate embeddings service returning None + return None + + mock_embedding_service.create_embeddings = mock_create_embeddings + + mock_settings = text_processor.GlobalSettings( + use_vectors=True, + use_multimodal=False, + embedding_dimensions=1536, + embedding_service=mock_embedding_service, + sentence_splitter=object(), + ) + monkeypatch.setattr(text_processor, "settings", mock_settings) + + # Mock process_text to return a section + def mock_process_text(pages, file, splitter, category): + chunk = ChunkStub(page_num=0, text="Content 1", images=[]) + return [SectionStub(chunk=chunk)] + + monkeypatch.setattr(text_processor, "process_text", mock_process_text) + + request_payload = { + "values": [ + { + "recordId": "doc-1", + "data": { + "consolidated_document": { + "file_name": "test.pdf", + "storageUrl": "https://storage.example.com/test.pdf", + "pages": [{"page_num": 0, "text": "Some text", "figure_ids": []}], + "figures": [], + }, + }, + } + ] + } + + with caplog.at_level(logging.WARNING): + response = await text_processor.process_text_entry(build_request(request_payload)) + + 
assert response.status_code == 200 + assert "were requested but missing" in caplog.text diff --git a/tests/test_pdfparser.py b/tests/test_pdfparser.py index c006a392a5..3d8e3ae3e7 100644 --- a/tests/test_pdfparser.py +++ b/tests/test_pdfparser.py @@ -30,6 +30,19 @@ TEST_DATA_DIR = pathlib.Path(__file__).parent / "test-data" +@pytest.fixture +def sample_image(): + """Fixture for a sample ImageOnPage object used across multiple tests.""" + return ImageOnPage( + bytes=b"fake", + bbox=(0, 0, 100, 100), + page_num=1, + figure_id="fig_1", + placeholder='
', + filename="test.png", + ) + + def assert_image_equal(image1, image2): assert image1.size == image2.size assert image1.mode == image2.mode @@ -427,3 +440,246 @@ def __init__(self, endpoint, credential): result_second = await figure_processor.describe(b"image") assert result_second == "A diagram" assert describer_instance.create_analyzer.await_count == 1 + + +@pytest.mark.asyncio +async def test_figure_processor_none_strategy_returns_none(): + figure_processor = FigureProcessor(strategy=MediaDescriptionStrategy.NONE) + + describer = await figure_processor.get_media_describer() + assert describer is None + + result = await figure_processor.describe(b"bytes") + assert result is None + + +@pytest.mark.asyncio +async def test_figure_processor_content_understanding_missing_endpoint(): + figure_processor = FigureProcessor( + strategy=MediaDescriptionStrategy.CONTENTUNDERSTANDING, + credential=MockAzureCredential(), + ) + + with pytest.raises(ValueError, match="Content Understanding strategy requires an endpoint"): + await figure_processor.get_media_describer() + + +@pytest.mark.asyncio +async def test_figure_processor_content_understanding_missing_credential(): + figure_processor = FigureProcessor( + strategy=MediaDescriptionStrategy.CONTENTUNDERSTANDING, + content_understanding_endpoint="https://example.com", + ) + + with pytest.raises(ValueError, match="Content Understanding strategy requires a credential"): + await figure_processor.get_media_describer() + + +@pytest.mark.asyncio +async def test_figure_processor_content_understanding_key_credential(): + from azure.core.credentials import AzureKeyCredential + + figure_processor = FigureProcessor( + strategy=MediaDescriptionStrategy.CONTENTUNDERSTANDING, + credential=AzureKeyCredential("fake_key"), + content_understanding_endpoint="https://example.com", + ) + + with pytest.raises(ValueError, match="Content Understanding does not support key credentials"): + await figure_processor.get_media_describer() + + 
+@pytest.mark.asyncio +async def test_figure_processor_openai_returns_describer(monkeypatch): + mock_client = Mock() + figure_processor = FigureProcessor( + strategy=MediaDescriptionStrategy.OPENAI, + openai_client=mock_client, + openai_model="gpt-4o", + openai_deployment="gpt-4o-deployment", + ) + + describer = await figure_processor.get_media_describer() + assert describer is not None + assert figure_processor._media_describer is describer + + # Second call should return the same instance + describer2 = await figure_processor.get_media_describer() + assert describer2 is describer + + +@pytest.mark.asyncio +async def test_figure_processor_unknown_strategy(caplog): + # Create a processor with an invalid strategy by patching the enum + figure_processor = FigureProcessor(strategy=MediaDescriptionStrategy.NONE) + # Override the strategy to an unknown value + figure_processor.strategy = "unknown_strategy" # type: ignore[assignment] + + with caplog.at_level(logging.WARNING): + describer = await figure_processor.get_media_describer() + + assert describer is None + assert "Unknown media description strategy" in caplog.text + + +@pytest.mark.asyncio +async def test_figure_processor_mark_content_understanding_ready(): + figure_processor = FigureProcessor(strategy=MediaDescriptionStrategy.NONE) + + assert not figure_processor._content_understanding_ready + figure_processor.mark_content_understanding_ready() + assert figure_processor._content_understanding_ready + + +@pytest.mark.asyncio +async def test_build_figure_markup_without_description(sample_image): + from prepdocslib.figureprocessor import build_figure_markup + + sample_image.title = "Sample Figure" + + result = build_figure_markup(sample_image, description=None) + assert result == "
fig_1 Sample Figure
" + + +@pytest.mark.asyncio +async def test_process_page_image_without_blob_manager(sample_image): + from prepdocslib.figureprocessor import process_page_image + + with pytest.raises(ValueError, match="BlobManager must be provided"): + await process_page_image( + image=sample_image, + document_filename="test.pdf", + blob_manager=None, + image_embeddings_client=None, + ) + + +@pytest.mark.asyncio +async def test_process_page_image_without_figure_processor(sample_image): + from prepdocslib.figureprocessor import process_page_image + + blob_manager = AsyncMock() + blob_manager.upload_document_image = AsyncMock(return_value="https://example.com/image.png") + + result = await process_page_image( + image=sample_image, + document_filename="test.pdf", + blob_manager=blob_manager, + image_embeddings_client=None, + figure_processor=None, + ) + + assert result.description is None + assert result.url == "https://example.com/image.png" + blob_manager.upload_document_image.assert_awaited_once() + + +@pytest.mark.asyncio +async def test_process_page_image_sets_description(sample_image): + from prepdocslib.figureprocessor import process_page_image + + blob_manager = AsyncMock() + blob_manager.upload_document_image = AsyncMock(return_value="https://example.com/image.png") + + figure_processor = AsyncMock() + figure_processor.describe = AsyncMock(return_value="A bar chart") + + result = await process_page_image( + image=sample_image, + document_filename="test.pdf", + blob_manager=blob_manager, + image_embeddings_client=None, + figure_processor=figure_processor, + ) + + assert result.description == "A bar chart" + figure_processor.describe.assert_awaited_once() + + +@pytest.mark.asyncio +async def test_process_page_image_skips_upload_if_url_exists(sample_image): + from prepdocslib.figureprocessor import process_page_image + + sample_image.url = "https://existing.com/image.png" + + blob_manager = AsyncMock() + blob_manager.upload_document_image = AsyncMock() + + result = await 
process_page_image( + image=sample_image, + document_filename="test.pdf", + blob_manager=blob_manager, + image_embeddings_client=None, + ) + + assert result.url == "https://existing.com/image.png" + blob_manager.upload_document_image.assert_not_awaited() + + +@pytest.mark.asyncio +async def test_process_page_image_with_embeddings(sample_image): + from prepdocslib.figureprocessor import process_page_image + + blob_manager = AsyncMock() + blob_manager.upload_document_image = AsyncMock(return_value="https://example.com/image.png") + + image_embeddings = AsyncMock() + image_embeddings.create_embedding_for_image = AsyncMock(return_value=[0.1, 0.2, 0.3]) + + result = await process_page_image( + image=sample_image, + document_filename="test.pdf", + blob_manager=blob_manager, + image_embeddings_client=image_embeddings, + ) + + assert result.embedding == [0.1, 0.2, 0.3] + image_embeddings.create_embedding_for_image.assert_awaited_once() + + +def test_image_on_page_from_skill_payload_without_bytes(): + """Test ImageOnPage.from_skill_payload when bytes_base64 is not provided.""" + payload = { + "filename": "test.png", + "figure_id": "fig_1", + "page_num": "1", + "bbox": [0, 0, 100, 100], + "document_file_name": "test.pdf", + } + + image, doc_filename = ImageOnPage.from_skill_payload(payload) + + assert image.bytes == b"" + assert image.filename == "test.png" + assert image.figure_id == "fig_1" + assert image.page_num == 1 + assert image.bbox == (0, 0, 100, 100) + assert doc_filename == "test.pdf" + + +def test_image_on_page_from_skill_payload_invalid_page_num(): + """Test ImageOnPage.from_skill_payload with invalid page_num.""" + payload = { + "filename": "test.png", + "figure_id": "fig_1", + "page_num": "invalid", + "bbox": [0, 0, 100, 100], + } + + image, _ = ImageOnPage.from_skill_payload(payload) + + assert image.page_num == 0 + + +def test_image_on_page_from_skill_payload_invalid_bbox(): + """Test ImageOnPage.from_skill_payload with invalid bbox.""" + payload = { + 
"filename": "test.png", + "figure_id": "fig_1", + "page_num": 1, + "bbox": [0, 0, 100], # Only 3 elements + } + + image, _ = ImageOnPage.from_skill_payload(payload) + + assert image.bbox == (0, 0, 0, 0) diff --git a/tests/test_prepdocs.py b/tests/test_prepdocs.py index 795d18c2b8..6420959487 100644 --- a/tests/test_prepdocs.py +++ b/tests/test_prepdocs.py @@ -192,46 +192,6 @@ async def test_image_embeddings_success(mock_azurehttp_calls): mock_token_provider.assert_called_once() -def test_setup_blob_manager_respects_storage_key(monkeypatch: pytest.MonkeyPatch) -> None: - captured: dict[str, object] = {} - - class StubBlobManager: - def __init__( - self, - *, - endpoint: str, - container: str, - account: str, - credential: object, - resource_group: str, - subscription_id: str, - image_container: str | None = None, - ) -> None: - captured["endpoint"] = endpoint - captured["container"] = container - captured["account"] = account - captured["credential"] = credential - captured["resource_group"] = resource_group - captured["subscription_id"] = subscription_id - captured["image_container"] = image_container - - monkeypatch.setattr(prepdocs, "BlobManager", StubBlobManager) - - result = prepdocs.setup_blob_manager( - azure_credential=MockAzureCredential(), - storage_account="storageacct", - storage_container="docs", - storage_resource_group="rg", - subscription_id="sub-id", - storage_key="override-key", - image_storage_container="images", - ) - - assert isinstance(result, StubBlobManager) - assert captured["credential"] == "override-key" - assert captured["image_container"] == "images" - - def test_setup_list_file_strategy_uses_datalake_key(monkeypatch: pytest.MonkeyPatch) -> None: captured: dict[str, object] = {} @@ -268,72 +228,6 @@ def __init__( assert captured["enable_global_documents"] is True -def test_setup_embeddings_service_populates_azure_metadata() -> None: - embeddings = prepdocs.setup_embeddings_service( - open_ai_client=MockClient( - MockEmbeddingsClient( - 
openai.types.CreateEmbeddingResponse( - object="list", - data=[], - model="text-embedding-3-large", - usage=Usage(prompt_tokens=0, total_tokens=0), - ) - ) - ), - openai_host=prepdocs.OpenAIHost.AZURE, - emb_model_name=MOCK_EMBEDDING_MODEL_NAME, - emb_model_dimensions=MOCK_EMBEDDING_DIMENSIONS, - azure_openai_deployment="deployment", - azure_openai_endpoint="https://service.openai.azure.com", - ) - - assert isinstance(embeddings, OpenAIEmbeddings) - assert embeddings.azure_deployment_name == "deployment" - assert embeddings.azure_endpoint == "https://service.openai.azure.com" - - -def test_setup_embeddings_service_requires_endpoint_for_azure() -> None: - with pytest.raises(ValueError): - prepdocs.setup_embeddings_service( - open_ai_client=MockClient( - MockEmbeddingsClient( - openai.types.CreateEmbeddingResponse( - object="list", - data=[], - model="text-embedding-3-large", - usage=Usage(prompt_tokens=0, total_tokens=0), - ) - ) - ), - openai_host=prepdocs.OpenAIHost.AZURE, - emb_model_name=MOCK_EMBEDDING_MODEL_NAME, - emb_model_dimensions=MOCK_EMBEDDING_DIMENSIONS, - azure_openai_deployment="deployment", - azure_openai_endpoint=None, - ) - - -def test_setup_embeddings_service_requires_deployment_for_azure() -> None: - with pytest.raises(ValueError): - prepdocs.setup_embeddings_service( - open_ai_client=MockClient( - MockEmbeddingsClient( - openai.types.CreateEmbeddingResponse( - object="list", - data=[], - model="text-embedding-3-large", - usage=Usage(prompt_tokens=0, total_tokens=0), - ) - ) - ), - openai_host=prepdocs.OpenAIHost.AZURE, - emb_model_name=MOCK_EMBEDDING_MODEL_NAME, - emb_model_dimensions=MOCK_EMBEDDING_DIMENSIONS, - azure_openai_deployment=None, - azure_openai_endpoint="https://service.openai.azure.com", - ) - - @pytest.mark.asyncio async def test_openai_embeddings_use_deployment_for_azure_model(): class RecordingEmbeddingsClient: @@ -424,99 +318,3 @@ async def run(self) -> None: assert captured["credentials"].key == "secret" assert 
captured["service_name"] == "searchsvc" assert captured["index_name"] == "searchindex" - - -def test_setup_openai_client_azure_constructs_endpoint_correctly(monkeypatch: pytest.MonkeyPatch) -> None: - """Test that setup_openai_client correctly constructs the Azure OpenAI endpoint URL from service name.""" - captured_base_url: list[str] = [] - - class StubAsyncOpenAI: - def __init__(self, *, base_url: str, api_key, **kwargs) -> None: - captured_base_url.append(base_url) - - monkeypatch.setattr(prepdocs, "AsyncOpenAI", StubAsyncOpenAI) - monkeypatch.setattr(prepdocs, "get_bearer_token_provider", lambda *args, **kwargs: lambda: "fake_token") - - client, endpoint = prepdocs.setup_openai_client( - openai_host=prepdocs.OpenAIHost.AZURE, - azure_credential=MockAzureCredential(), - azure_openai_service="myopenaiservice", - ) - - # Verify the endpoint is constructed correctly - assert endpoint == "https://myopenaiservice.openai.azure.com" - # Verify the base_url includes the endpoint with the openai/v1 suffix - assert captured_base_url[0] == "https://myopenaiservice.openai.azure.com/openai/v1" - - -def test_setup_openai_client_azure_custom_uses_custom_url(monkeypatch: pytest.MonkeyPatch) -> None: - """Test that setup_openai_client uses the custom URL for azure_custom host.""" - captured_base_url: list[str] = [] - - class StubAsyncOpenAI: - def __init__(self, *, base_url: str, api_key, **kwargs) -> None: - captured_base_url.append(base_url) - - monkeypatch.setattr(prepdocs, "AsyncOpenAI", StubAsyncOpenAI) - - client, endpoint = prepdocs.setup_openai_client( - openai_host=prepdocs.OpenAIHost.AZURE_CUSTOM, - azure_credential=MockAzureCredential(), - azure_openai_custom_url="https://custom.endpoint.com/openai", - azure_openai_api_key="test-key", - ) - - # Verify the custom URL is used - assert captured_base_url[0] == "https://custom.endpoint.com/openai" - # Verify endpoint is None for custom URLs - assert endpoint is None - - -def 
test_setup_openai_client_azure_respects_api_key(monkeypatch: pytest.MonkeyPatch) -> None: - """Test that setup_openai_client uses the API key override when provided.""" - captured_api_key: list[str] = [] - - class StubAsyncOpenAI: - def __init__(self, *, base_url: str, api_key: str, **kwargs) -> None: - captured_api_key.append(api_key) - - monkeypatch.setattr(prepdocs, "AsyncOpenAI", StubAsyncOpenAI) - - client, endpoint = prepdocs.setup_openai_client( - openai_host=prepdocs.OpenAIHost.AZURE, - azure_credential=MockAzureCredential(), - azure_openai_service="myopenaiservice", - azure_openai_api_key="my-api-key-override", - ) - - assert captured_api_key[0] == "my-api-key-override" - - -def test_setup_openai_client_openai_requires_api_key() -> None: - """Test that setup_openai_client raises ValueError when using OpenAI without API key.""" - with pytest.raises(ValueError, match="OpenAI key is required"): - prepdocs.setup_openai_client( - openai_host=prepdocs.OpenAIHost.OPENAI, - azure_credential=MockAzureCredential(), - openai_api_key=None, - ) - - -def test_setup_openai_client_azure_requires_service() -> None: - """Test that setup_openai_client raises ValueError when using Azure without service name.""" - with pytest.raises(ValueError, match="AZURE_OPENAI_SERVICE must be set"): - prepdocs.setup_openai_client( - openai_host=prepdocs.OpenAIHost.AZURE, - azure_credential=MockAzureCredential(), - azure_openai_service=None, - ) - - -def test_setup_openai_client_azure_custom_requires_url() -> None: - """Test that setup_openai_client raises ValueError when using azure_custom without custom URL.""" - with pytest.raises(ValueError, match="AZURE_OPENAI_CUSTOM_URL must be set"): - prepdocs.setup_openai_client( - openai_host=prepdocs.OpenAIHost.AZURE_CUSTOM, - azure_credential=MockAzureCredential(), - azure_openai_custom_url=None, - ) diff --git a/tests/test_prepdocslib_filestrategy.py b/tests/test_prepdocslib_filestrategy.py index 882832e739..d8e8543be7 100644 --- 
@pytest.mark.asyncio
async def test_parse_file_with_images(monkeypatch):
    """parse_file should walk pages that carry images, upload/describe them, and return the processed sections."""
    from io import BytesIO

    from prepdocslib.filestrategy import parse_file
    from prepdocslib.listfilestrategy import File
    from prepdocslib.page import ImageOnPage, Page

    # Minimal in-memory file standing in for an uploaded document.
    mock_file = File(content=BytesIO(b"test content"))
    mock_file.filename = lambda: "test.txt"

    # Parser stub that yields a single page carrying one image.
    mock_parser = type("MockParser", (), {})()

    async def mock_parse(content):
        image = ImageOnPage(
            bytes=b"fake_image",
            bbox=(0, 0, 100, 100),
            page_num=1,
            figure_id="fig_1",
            filename="test_image.png",
            # NOTE(review): the original placeholder literal was lost in extraction —
            # confirm the exact figure markup emitted by the real parser.
            placeholder="<figure>",
        )
        page = Page(page_num=1, text="Some text", offset=0)
        page.images = [image]
        yield page

    mock_parser.parse = mock_parse

    mock_splitter = type("MockSplitter", (), {})()
    mock_processor = type("MockProcessor", (), {"parser": mock_parser, "splitter": mock_splitter})()

    # Blob manager stub: every image upload "succeeds" with a fixed URL.
    mock_blob_manager = type("MockBlobManager", (), {})()

    async def mock_upload(*args, **kwargs):
        return "https://example.com/image.png"

    mock_blob_manager.upload_document_image = mock_upload

    # Figure processor stub returning a canned description.
    mock_figure_processor = type("MockFigureProcessor", (), {})()

    async def mock_describe(image_bytes):  # renamed from `bytes` to avoid shadowing the builtin
        return "A test image"

    mock_figure_processor.describe = mock_describe

    # process_text is exercised elsewhere; here it only needs to return sections.
    def mock_process_text(pages, file, splitter, category):
        return []

    monkeypatch.setattr("prepdocslib.filestrategy.process_text", mock_process_text)

    sections = await parse_file(
        mock_file,
        {".txt": mock_processor},
        category=None,
        blob_manager=mock_blob_manager,
        image_embeddings_client=None,
        figure_processor=mock_figure_processor,
        user_oid=None,
    )

    assert sections == []


@pytest.mark.asyncio
async def test_file_strategy_setup_with_content_understanding(monkeypatch, mock_env):
    """FileStrategy.setup() should initialize the Content Understanding describer and call create_analyzer()."""
    from prepdocslib.figureprocessor import FigureProcessor, MediaDescriptionStrategy
    from prepdocslib.listfilestrategy import LocalListFileStrategy

    list_strategy = LocalListFileStrategy(path_pattern="*.txt")

    blob_manager = BlobManager(
        endpoint=f"https://{os.environ['AZURE_STORAGE_ACCOUNT']}.blob.core.windows.net",
        credential=MockAzureCredential(),
        container=os.environ["AZURE_STORAGE_CONTAINER"],
        account=os.environ["AZURE_STORAGE_ACCOUNT"],
        resource_group=os.environ["AZURE_STORAGE_RESOURCE_GROUP"],
        subscription_id=os.environ["AZURE_SUBSCRIPTION_ID"],
    )

    search_info = SearchInfo(
        endpoint="https://testsearchclient.blob.core.windows.net",
        credential=MockAzureCredential(),
        index_name="test",
    )

    class MockContentUnderstandingDescriber:
        """Records whether create_analyzer() was invoked during setup."""

        def __init__(self, endpoint, credential):
            self.endpoint = endpoint
            self.credential = credential
            self.create_analyzer_called = False

        async def create_analyzer(self):
            self.create_analyzer_called = True

    # Patch the describer in every module that imports it.
    monkeypatch.setattr("prepdocslib.figureprocessor.ContentUnderstandingDescriber", MockContentUnderstandingDescriber)
    monkeypatch.setattr("prepdocslib.filestrategy.ContentUnderstandingDescriber", MockContentUnderstandingDescriber)

    figure_processor = FigureProcessor(
        strategy=MediaDescriptionStrategy.CONTENTUNDERSTANDING,
        credential=MockAzureCredential(),
        content_understanding_endpoint="https://example.com",
    )

    # Index creation performs network I/O; stub it out.
    async def mock_create_index(self):
        pass

    monkeypatch.setattr("prepdocslib.searchmanager.SearchManager.create_index", mock_create_index)

    file_strategy = FileStrategy(
        list_file_strategy=list_strategy,
        blob_manager=blob_manager,
        search_info=search_info,
        file_processors={".txt": FileProcessor(TextParser(), SimpleTextSplitter())},
        figure_processor=figure_processor,
    )

    await file_strategy.setup()

    # Setup must have created and primed the Content Understanding describer.
    assert figure_processor._media_describer is not None
    assert isinstance(figure_processor._media_describer, MockContentUnderstandingDescriber)
    assert figure_processor._media_describer.create_analyzer_called
    assert figure_processor._content_understanding_ready
import pytest

from prepdocslib.servicesetup import (
    OpenAIHost,
    setup_blob_manager,
    setup_embeddings_service,
    setup_openai_client,
)

from .mocks import (
    MOCK_EMBEDDING_DIMENSIONS,
    MOCK_EMBEDDING_MODEL_NAME,
    MockAzureCredential,
)
from .test_prepdocs import MockClient, MockEmbeddingsClient


def _mock_embeddings_openai_client():
    """Build a mock OpenAI client whose embeddings endpoint returns an empty response."""
    import openai
    from openai.types.create_embedding_response import Usage

    return MockClient(
        MockEmbeddingsClient(
            openai.types.CreateEmbeddingResponse(
                object="list",
                data=[],
                model="text-embedding-3-large",
                usage=Usage(prompt_tokens=0, total_tokens=0),
            )
        )
    )


def test_setup_blob_manager_respects_storage_key(monkeypatch: pytest.MonkeyPatch) -> None:
    """setup_blob_manager should pass an explicit storage key through as the credential."""
    captured: dict[str, object] = {}

    class StubBlobManager:
        def __init__(
            self,
            *,
            endpoint: str,
            container: str,
            account: str,
            credential: object,
            resource_group: str,
            subscription_id: str,
            image_container: str | None = None,
        ) -> None:
            captured["endpoint"] = endpoint
            captured["container"] = container
            captured["account"] = account
            captured["credential"] = credential
            captured["resource_group"] = resource_group
            captured["subscription_id"] = subscription_id
            captured["image_container"] = image_container

    import prepdocslib.servicesetup as servicesetup_module

    monkeypatch.setattr(servicesetup_module, "BlobManager", StubBlobManager)

    result = setup_blob_manager(
        azure_credential=MockAzureCredential(),
        storage_account="storageacct",
        storage_container="docs",
        storage_resource_group="rg",
        subscription_id="sub-id",
        storage_key="override-key",
        image_storage_container="images",
    )

    assert isinstance(result, StubBlobManager)
    assert captured["credential"] == "override-key"
    assert captured["image_container"] == "images"


def test_setup_embeddings_service_populates_azure_metadata() -> None:
    """Embeddings built for the Azure host should carry the deployment name and endpoint."""
    embeddings = setup_embeddings_service(
        open_ai_client=_mock_embeddings_openai_client(),
        openai_host=OpenAIHost.AZURE,
        emb_model_name=MOCK_EMBEDDING_MODEL_NAME,
        emb_model_dimensions=MOCK_EMBEDDING_DIMENSIONS,
        azure_openai_deployment="deployment",
        azure_openai_endpoint="https://service.openai.azure.com",
    )

    from prepdocslib.embeddings import OpenAIEmbeddings

    assert isinstance(embeddings, OpenAIEmbeddings)
    assert embeddings.azure_deployment_name == "deployment"
    assert embeddings.azure_endpoint == "https://service.openai.azure.com"


def test_setup_embeddings_service_requires_endpoint_for_azure() -> None:
    """Omitting the Azure OpenAI endpoint must raise ValueError."""
    with pytest.raises(ValueError):
        setup_embeddings_service(
            open_ai_client=_mock_embeddings_openai_client(),
            openai_host=OpenAIHost.AZURE,
            emb_model_name=MOCK_EMBEDDING_MODEL_NAME,
            emb_model_dimensions=MOCK_EMBEDDING_DIMENSIONS,
            azure_openai_deployment="deployment",
            azure_openai_endpoint=None,
        )


def test_setup_embeddings_service_requires_deployment_for_azure() -> None:
    """Omitting the Azure OpenAI deployment must raise ValueError."""
    with pytest.raises(ValueError):
        setup_embeddings_service(
            open_ai_client=_mock_embeddings_openai_client(),
            openai_host=OpenAIHost.AZURE,
            emb_model_name=MOCK_EMBEDDING_MODEL_NAME,
            emb_model_dimensions=MOCK_EMBEDDING_DIMENSIONS,
            azure_openai_deployment=None,
            azure_openai_endpoint="https://service.openai.azure.com",
        )


def test_setup_openai_client_azure_constructs_endpoint_correctly(monkeypatch: pytest.MonkeyPatch) -> None:
    """setup_openai_client should build the Azure OpenAI endpoint URL from the service name."""
    captured_base_url: list[str] = []

    class StubAsyncOpenAI:
        def __init__(self, *, base_url: str, api_key, **kwargs) -> None:
            captured_base_url.append(base_url)

    import prepdocslib.servicesetup as servicesetup_module

    monkeypatch.setattr(servicesetup_module, "AsyncOpenAI", StubAsyncOpenAI)
    monkeypatch.setattr(servicesetup_module, "get_bearer_token_provider", lambda *args, **kwargs: lambda: "fake_token")

    client, endpoint = setup_openai_client(
        openai_host=OpenAIHost.AZURE,
        azure_credential=MockAzureCredential(),
        azure_openai_service="myopenaiservice",
    )

    # Endpoint is derived from the service name.
    assert endpoint == "https://myopenaiservice.openai.azure.com"
    # The OpenAI-compatible surface lives under the /openai/v1 suffix.
    assert captured_base_url[0] == "https://myopenaiservice.openai.azure.com/openai/v1"


def test_setup_openai_client_azure_custom_uses_custom_url(monkeypatch: pytest.MonkeyPatch) -> None:
    """setup_openai_client should use the custom URL verbatim for the azure_custom host."""
    captured_base_url: list[str] = []

    class StubAsyncOpenAI:
        def __init__(self, *, base_url: str, api_key, **kwargs) -> None:
            captured_base_url.append(base_url)

    import prepdocslib.servicesetup as servicesetup_module

    monkeypatch.setattr(servicesetup_module, "AsyncOpenAI", StubAsyncOpenAI)

    client, endpoint = setup_openai_client(
        openai_host=OpenAIHost.AZURE_CUSTOM,
        azure_credential=MockAzureCredential(),
        azure_openai_custom_url="https://custom.endpoint.com/openai",
        azure_openai_api_key="test-key",
    )

    assert captured_base_url[0] == "https://custom.endpoint.com/openai"
    # No derived endpoint exists for custom URLs.
    assert endpoint is None


def test_setup_openai_client_azure_respects_api_key(monkeypatch: pytest.MonkeyPatch) -> None:
    """setup_openai_client should prefer an explicit API key over token credentials."""
    captured_api_key: list[str] = []

    class StubAsyncOpenAI:
        def __init__(self, *, base_url: str, api_key: str, **kwargs) -> None:
            captured_api_key.append(api_key)

    import prepdocslib.servicesetup as servicesetup_module

    monkeypatch.setattr(servicesetup_module, "AsyncOpenAI", StubAsyncOpenAI)

    client, endpoint = setup_openai_client(
        openai_host=OpenAIHost.AZURE,
        azure_credential=MockAzureCredential(),
        azure_openai_service="myopenaiservice",
        azure_openai_api_key="my-api-key-override",
    )

    assert captured_api_key[0] == "my-api-key-override"


def test_setup_openai_client_openai_requires_api_key() -> None:
    """Using the openai.com host without an API key must raise ValueError."""
    with pytest.raises(ValueError, match="OpenAI key is required"):
        setup_openai_client(
            openai_host=OpenAIHost.OPENAI,
            azure_credential=MockAzureCredential(),
            openai_api_key=None,
        )


def test_setup_openai_client_azure_requires_service() -> None:
    """Using the Azure host without a service name must raise ValueError."""
    with pytest.raises(ValueError, match="AZURE_OPENAI_SERVICE must be set"):
        setup_openai_client(
            openai_host=OpenAIHost.AZURE,
            azure_credential=MockAzureCredential(),
            azure_openai_service=None,
        )


def test_setup_openai_client_azure_custom_requires_url() -> None:
    """Using the azure_custom host without a custom URL must raise ValueError."""
    with pytest.raises(ValueError, match="AZURE_OPENAI_CUSTOM_URL must be set"):
        setup_openai_client(
            openai_host=OpenAIHost.AZURE_CUSTOM,
            azure_credential=MockAzureCredential(),
            azure_openai_custom_url=None,
        )


def test_setup_search_info_agentic_retrieval_without_model():
    """Agentic retrieval without a search agent model must raise ValueError."""
    from prepdocslib.servicesetup import setup_search_info

    with pytest.raises(ValueError, match="SearchAgent model must be specified"):
        setup_search_info(
            azure_credential=MockAzureCredential(),
            search_service="mysearch",
            index_name="myindex",
            use_agentic_retrieval=True,
            azure_openai_searchagent_model=None,
        )


def test_setup_image_embeddings_multimodal_without_vision():
    """Multimodal mode without a vision endpoint must raise ValueError."""
    from prepdocslib.servicesetup import setup_image_embeddings_service

    with pytest.raises(ValueError, match="Azure AI Vision endpoint must be provided"):
        setup_image_embeddings_service(
            use_multimodal=True,
            vision_endpoint=None,
            azure_credential=MockAzureCredential(),
        )


def test_setup_figure_processor_content_understanding():
    """setup_figure_processor should select the Content Understanding strategy."""
    from prepdocslib.figureprocessor import FigureProcessor, MediaDescriptionStrategy
    from prepdocslib.servicesetup import setup_figure_processor

    processor = setup_figure_processor(
        use_multimodal=False,
        use_content_understanding=True,
        content_understanding_endpoint="https://example.com",
        credential=MockAzureCredential(),
        openai_client=None,
        openai_model=None,
        openai_deployment=None,
    )

    assert isinstance(processor, FigureProcessor)
    assert processor.strategy == MediaDescriptionStrategy.CONTENTUNDERSTANDING


def test_setup_parser_document_intelligence_with_key():
    """select_parser should use Document Intelligence when a key credential is provided."""
    from prepdocslib.pdfparser import DocumentAnalysisParser
    from prepdocslib.servicesetup import select_parser

    parser = select_parser(
        file_name="test.pdf",
        content_type="application/pdf",
        azure_credential=MockAzureCredential(),
        document_intelligence_service="myservice",
        document_intelligence_key="my-key",
        use_local_html_parser=False,
    )

    assert isinstance(parser, DocumentAnalysisParser)


def test_setup_parser_text_file():
    """select_parser should return TextParser for plain-text files."""
    from prepdocslib.servicesetup import select_parser
    from prepdocslib.textparser import TextParser

    parser = select_parser(
        file_name="test.txt",
        content_type="text/plain",
        azure_credential=MockAzureCredential(),
    )

    assert isinstance(parser, TextParser)


def test_setup_parser_application_type_with_di():
    """select_parser should fall back to Document Intelligence for application/* content types."""
    from prepdocslib.pdfparser import DocumentAnalysisParser
    from prepdocslib.servicesetup import select_parser

    parser = select_parser(
        file_name="test.unknown",
        content_type="application/unknown",
        azure_credential=MockAzureCredential(),
        document_intelligence_service="myservice",
    )

    assert isinstance(parser, DocumentAnalysisParser)


def test_setup_parser_unsupported_file_type():
    """select_parser should reject file types it has no parser for."""
    from prepdocslib.servicesetup import select_parser

    with pytest.raises(ValueError, match="Unsupported file type"):
        select_parser(
            file_name="test.xyz",
            content_type="application/xyz",
            azure_credential=MockAzureCredential(),
        )
def test_combine_text_with_figures_placeholder_not_found(caplog):
    """A figure whose placeholder is absent from the page text should log a warning."""
    import logging

    image = ImageOnPage(
        bytes=b"fake",
        bbox=(0, 0, 100, 100),
        filename="test.png",
        page_num=1,
        figure_id="fig_1",
        placeholder="[PLACEHOLDER_fig_1]",
        description="A test image",
    )

    page = Page(page_num=1, text="Some text without placeholder", offset=0)
    page.images = [image]

    with caplog.at_level(logging.WARNING):
        combine_text_with_figures(page)

    assert "Placeholder not found for figure fig_1" in caplog.text


def test_combine_text_with_figures_replaces_successfully():
    """combine_text_with_figures should swap the placeholder for the figure markup and description."""
    image = ImageOnPage(
        bytes=b"fake",
        bbox=(0, 0, 100, 100),
        filename="test.png",
        page_num=1,
        figure_id="fig_1",
        title="Test Figure",
        placeholder="[PLACEHOLDER_fig_1]",
        description="A test image",
    )

    page = Page(page_num=1, text="Some text [PLACEHOLDER_fig_1] more text", offset=0)
    page.images = [image]

    combine_text_with_figures(page)

    assert "[PLACEHOLDER_fig_1]" not in page.text
    # NOTE(review): the original also asserted the exact figure markup inserted into the
    # text; that literal was lost in extraction — confirm against combine_text_with_figures.
    assert "A test image" in page.text
SearchIndexer( + name=self.indexer_name, + description="Indexer orchestrating cloud ingestion pipeline", + data_source_name=self.data_source_name, + target_index_name=self.search_info.index_name, + skillset_name=self.skillset_name, + parameters=IndexingParameters( + configuration=IndexingParametersConfiguration( + query_timeout=None, # type: ignore + data_to_extract="storageMetadata", + allow_skillset_to_read_file_data=True, + ) + ), + ) + await indexer_client.create_or_update_indexer(indexer) + async def run(self) -> None: - if self.document_action == DocumentAction.Add: - files = self.list_file_strategy.list() - async for file in files: - try: - await self.blob_manager.upload_blob(file) - finally: - if file: - file.close() - elif self.document_action == DocumentAction.Remove: - paths = self.list_file_strategy.list_paths() - async for path in paths: - await self.blob_manager.remove_blob(path) - elif self.document_action == DocumentAction.RemoveAll: - await self.blob_manager.remove_blob() - - indexer = SearchIndexer( - name=self.indexer_name, - description="Indexer orchestrating cloud ingestion pipeline", - data_source_name=self.data_source_name, - target_index_name=self.search_info.index_name, - skillset_name=self.skillset_name, - parameters=IndexingParameters( - configuration=IndexingParametersConfiguration( - query_timeout=None, - data_to_extract="storageMetadata", - allow_skillset_to_read_file_data=True, - ) - ), - ) + files = self.list_file_strategy.list() + async for file in files: + try: + await self.blob_manager.upload_blob(file) + finally: + if file: + file.close() async with self.search_info.create_search_indexer_client() as indexer_client: - await indexer_client.create_or_update_indexer(indexer) await indexer_client.run_indexer(self.indexer_name) logger.info("Triggered indexer '%s' for cloud ingestion", self.indexer_name) diff --git a/app/backend/prepdocslib/page.py b/app/backend/prepdocslib/page.py index 0015df05ad..e9962fdf3e 100644 --- 
a/app/backend/prepdocslib/page.py +++ b/app/backend/prepdocslib/page.py @@ -85,15 +85,25 @@ def from_skill_payload(cls, data: dict[str, Any]) -> tuple["ImageOnPage", str]: else: bbox = (0, 0, 0, 0) + filename = data.get("filename") + figure_id = data.get("figure_id") + placeholder = data.get("placeholder") + assert filename is not None, "filename is required" + assert figure_id is not None, "figure_id is required" + + # Generate placeholder if not provided + if placeholder is None: + placeholder = f'
' + image = cls( bytes=raw_bytes, bbox=bbox, page_num=page_num, - filename=data.get("filename"), - figure_id=data.get("figure_id"), - placeholder=data.get("placeholder"), + filename=filename, + figure_id=figure_id, + placeholder=placeholder, mime_type=data.get("mime_type") or "image/png", - title=data.get("title"), + title=data.get("title") or "", description=data.get("description"), url=data.get("url"), ) diff --git a/app/backend/setup_cloud_ingestion.py b/app/backend/setup_cloud_ingestion.py index c6e945b8a8..94e980850a 100644 --- a/app/backend/setup_cloud_ingestion.py +++ b/app/backend/setup_cloud_ingestion.py @@ -7,9 +7,11 @@ from azure.core.credentials_async import AsyncTokenCredential from azure.identity.aio import AzureDeveloperCliCredential +from openai import AsyncOpenAI from rich.logging import RichHandler from load_azd_env import load_azd_env +from prepdocslib.blobmanager import BlobManager from prepdocslib.cloudingestionstrategy import CloudIngestionStrategy from prepdocslib.servicesetup import ( OpenAIHost, @@ -33,13 +35,13 @@ def clean_key_if_exists(key: Optional[str]) -> Optional[str]: async def setup_cloud_ingestion_strategy( azure_credential: AsyncTokenCredential, document_action: DocumentAction = DocumentAction.Add, -) -> CloudIngestionStrategy: +) -> tuple[CloudIngestionStrategy, AsyncOpenAI, AsyncTokenCredential, BlobManager]: """Setup the cloud ingestion strategy with all required services.""" # Get environment variables search_service = os.environ["AZURE_SEARCH_SERVICE"] index_name = os.environ["AZURE_SEARCH_INDEX"] - search_user_assigned_identity_resource_id = os.environ.get("AZURE_SEARCH_USER_ASSIGNED_IDENTITY_RESOURCE_ID") + search_user_assigned_identity_resource_id = os.environ["AZURE_SEARCH_USER_ASSIGNED_IDENTITY_RESOURCE_ID"] storage_account = os.environ["AZURE_STORAGE_ACCOUNT"] storage_container = os.environ["AZURE_STORAGE_CONTAINER"] storage_resource_group = os.environ["AZURE_STORAGE_RESOURCE_GROUP"] @@ -168,7 +170,8 @@ async def 
main(): # Setup the indexer, skillset, and data source logger.info("Setting up indexer, skillset, and data source...") await ingestion_strategy.setup() - logger.info("Cloud ingestion setup complete!") + logger.info("Triggering initial indexing run...") + await ingestion_strategy.run() finally: # Gracefully close any async clients/credentials diff --git a/docs/cloud_ingestion.md b/docs/cloud_ingestion.md deleted file mode 100644 index fc4cbc23d5..0000000000 --- a/docs/cloud_ingestion.md +++ /dev/null @@ -1,880 +0,0 @@ -# RAG chat: Cloud-Based data ingestion with Azure Functions - -This document describes the cloud-based ingestion architecture that uses Azure Functions as custom skills for Azure AI Search indexer. - -## Overview - -The cloud ingestion strategy provides an alternative to the local script-based ingestion (`scripts/prepdocs.sh`). Instead of processing documents locally and uploading them to Azure AI Search, the cloud approach uses: - -1. **Azure Blob Storage** as the document source -2. **Azure AI Search Indexer** as the orchestration engine -3. **Three Azure Functions** acting as chained custom skills for document processing - -This architecture enables serverless, scalable, and event-driven document processing. - -## Architecture - -TODO: Replace with a mermaid diagram like textsplitter has, -OR use my images from slides. 
- -```ascii -┌─────────────────────────────────────────────────────────────────┐ -│ USER: Upload files to blob storage (content container) │ -└──────────────────────┬──────────────────────────────────────────┘ - │ - ▼ -┌─────────────────────────────────────────────────────────────────┐ -│ Azure AI Search Indexer │ -│ - Blob data source (monitors content container) │ -│ - Skillset with 4 chained skills (3 custom + 1 built-in) │ -│ - Runs on schedule or on-demand │ -│ - Handles retries, checkpointing, state tracking │ -└──────────────────────┬──────────────────────────────────────────┘ - │ - ▼ - ┌──────────────────────────────┐ - │ SKILL #1: document_extractor│ - │ (Flex Consumption Function) │ - │ HTTP Trigger │ - │ Context: /document │ - │ Timeout: 10 minutes │ - └─────────────┬────────────────┘ - │ - Input: │ Output to /document: - • Blob URL │ • pages[] (text + figure ids) - • File metadata │ • figures[] (metadata + base64 image) - │ - Processes: │ - • Download blob │ - • Document Intelligence - • Figure cropping (PyMuPDF) - • Table extraction - │ - ▼ - ┌─────────────────────────────┐ - │ SKILL #2: figure_processor │ - │ (Flex Consumption Function)│ - │ HTTP Trigger │ - │ Context: /document/figures/*│ - │ Timeout: 6 minutes │ - │ Memory: 3072 MB │ - └─────────────┬───────────────┘ - │ - Input (per figure): │ Output to /document/figures/*: - • Figure bytes │ • description (enriched) - • Figure metadata │ • url (enriched) - • placeholder │ • embedding (enriched) - • title │ - Processes: │ - • Upload to blob │ - • Describe via LLM │ - • Embed via Vision │ - │ - ▼ - ┌─────────────────────────────┐ - │ SKILL #3: Shaper Skill │ - │ (Built-in Azure AI Search) │ - │ Context: /document │ - └─────────────┬───────────────┘ - │ - Purpose: │ Output to /document: - • Consolidate data │ • consolidated_document: - │ - pages[] (from skill #1) - Shaper combines: │ - figures[] (enriched from skill #2) - • Original pages │ - file_name - • Enriched figures │ - storageUrl - • File 
metadata │ - │ - Why needed: │ - Azure AI Search enrichment tree isolates contexts. - Data enriched at /document/figures/* doesn't automatically - merge into /document scope. Shaper explicitly consolidates - all fields into a single object for downstream consumption. - │ - ▼ - ┌─────────────────────────────┐ - │ SKILL #4: text_processor │ - │ (Combines, splits, embeds) │ - │ HTTP Trigger │ - │ Context: /document │ - │ Timeout: 5 minutes │ - │ Memory: 2048 MB │ - └─────────────┬───────────────┘ - │ - Input: │ Output: - • consolidated_doc │ • Array of chunks with: - - pages[] │ - Content text - - figures[] │ - Text embeddings - - file_name │ - Figure references + embeddings - - storageUrl │ - Metadata (sourcepage, etc.) - Processes: │ - • Enrich placeholders│ - • Split text │ - • Generate embeddings│ - │ - ▼ -┌─────────────────────────────────────────────────────────────────┐ -│ Azure AI Search Index │ -│ Indexer writes enriched documents with embeddings │ -└─────────────────────────────────────────────────────────────────┘ -``` - -## Components - -### 1. Document Extractor Function - -**Location:** `app/functions/document_extractor/` - -**Purpose:** First stage of processing—extracts structured content and raw figure payloads. - -**Responsibilities:** - -- Downloads documents from blob storage. -- Parses documents using Azure Document Intelligence or local fallbacks. -- Extracts tables as HTML fragments. -- Crops figure images with PyMuPDF and serialises them as base64 payloads. -- Emits markdown text containing `
` placeholders and companion metadata arrays. -- Captures per-page metadata linking text passages to figure identifiers. - -**Configuration:** - -- 10-minute timeout (supports large documents and multimodal preprocessing). -- 4096 MB instance memory (for document parsing and image manipulation). -- Python 3.11 runtime. -- Uses managed identity for authentication. - -**Input Format (Azure Search custom skill):** - -```json -{ - "values": [ - { - "recordId": "1", - "data": { - "blobUrl": "https://storage.../content/doc.pdf", - "fileName": "doc.pdf", - "contentType": "application/pdf" - } - } - ] -} -``` - -**Output Format:** - -```json -{ - "values": [ - { - "recordId": "1", - "data": { - "pages": [ - { - "pageNumber": 0, - "text": "Page text with
", - "figureIds": ["fig1"] - } - ], - - "figures": [ - { - "id": "fig1", - "page": 2, - "fileName": "doc.pdf", - "mimeType": "image/png", - "imageBase64": "iVBORw0...", - "bbox": [12.4, 30.1, 180.6, 210.2] - } - ], - "images": [] - }, - "errors": [], - "warnings": [] - } - ] -} -``` - -### 2. Figure Processor Function - -**Location:** `app/functions/figure_processor/` - -**Purpose:** Second stage—turns individual figure payloads into reusable assets and embeddings. - -**Responsibilities:** - -- Uploads figure bytes to blob storage and generates signed URLs or stored paths. -- Produces natural-language captions via GPT-4o or Content Understanding. -- Generates image embeddings via Azure AI Vision (when multimodal is enabled). -- Emits enriched figure metadata for downstream text processing. - -**Configuration:** - -- 6-minute timeout (covers caption plus embedding latency for complex figures). -- 3072 MB instance memory (accommodates concurrent figure batches). -- Python 3.11 runtime. -- Uses managed identity for authentication. - -**Input Format (context `/document/figures/*`):** Azure AI Search expands the skill context, calling the function once per figure and supplying a unique `recordId` for each entry in the `values` array. - -```json -{ - "values": [ - { - "recordId": "1", - "data": { - "id": "fig1", - "fileName": "doc.pdf", - "mimeType": "image/png", - "imageBase64": "iVBORw0...", - "page": 2, - "bbox": [12.4, 30.1, 180.6, 210.2] - } - } - ] -} -``` - -**Output Format:** - -```json -{ - "values": [ - { - "recordId": "1", - "data": { - "url": "https://storage.../images/doc-fig1.png", - "description": "
Bar chart showing quarterly revenue
", - "imageEmbedding": [0.789, -0.012, ...] - }, - "errors": [], - "warnings": [] - } - ] -} -``` - -### 4. Shaper Skill (Built-in) - -**Type:** Built-in Azure AI Search skill - -**Purpose:** Consolidates enrichments from different contexts into a single object. - -**Why Needed:** - -Azure AI Search's enrichment tree isolates data by context. When the `figure_processor` skill runs at context `/document/figures/*`, it enriches individual figure objects (adding `description`, `url`, `embedding`). However, these enrichments remain isolated in the `/document/figures/*` context and don't automatically merge into the `/document` context where the `text_processor` skill operates. - -The Shaper skill explicitly consolidates: - -- Original `pages` array from `document_extractor` -- Enriched `figures` array with `description`, `url`, `embedding` from `figure_processor` -- File metadata (`file_name`, `storageUrl`) - -This consolidated object is then passed to the `text_processor` skill, ensuring it receives all enriched data in a single, well-structured input. - -**Configuration:** - -- Context: `/document` -- Uses nested `inputs` syntax with `source_context` for array consolidation -- Output: `consolidated_document` object containing all required fields - -**Input Mapping:** - -```python -ShaperSkill( - name="document-shaper-skill", - context="/document", - inputs=[ - InputFieldMappingEntry(name="pages", source="/document/pages"), - InputFieldMappingEntry( - name="figures", - source_context="/document/figures/*", - inputs=[ - InputFieldMappingEntry(name="figure_id", source="/document/figures/*/figure_id"), - InputFieldMappingEntry(name="filename", source="/document/figures/*/filename"), - # ... 
other figure fields - InputFieldMappingEntry(name="description", source="/document/figures/*/description"), - InputFieldMappingEntry(name="url", source="/document/figures/*/url"), - InputFieldMappingEntry(name="embedding", source="/document/figures/*/embedding"), - ] - ), - InputFieldMappingEntry(name="file_name", source="/document/metadata_storage_name"), - InputFieldMappingEntry(name="storageUrl", source="/document/metadata_storage_path"), - ], - outputs=[ - OutputFieldMappingEntry(name="output", target_name="consolidated_document") - ] -) -``` - -**Output Format:** - -The Shaper skill produces a `consolidated_document` object at `/document/consolidated_document`: - -```json -{ - "consolidated_document": { - "pages": [ - {"page_num": 0, "text": "...", "figure_ids": ["1.1"]} - ], - "figures": [ - { - "figure_id": "1.1", - "filename": "figure1_1.png", - "description": "The image shows a logo...", - "url": "https://storage.../images/doc/figure1_1.png", - "embedding": [0.123, -0.456, ...] - } - ], - "file_name": "document.pdf", - "storageUrl": "https://storage.../content/document.pdf" - } -} -``` - -### 5. Text Processor Function - -**Location:** `app/functions/text_processor/` - -**Purpose:** Third stage—recombines enriched figure metadata with text, then produces search-ready chunks. - -**Responsibilities:** - -- Merges processed figure metadata back into markdown placeholders. -- Preserves `
` positioning so figures stay with their surrounding narrative. -- Splits text into semantically meaningful chunks using `SentenceTextSplitter`. -- Generates text embeddings via Azure OpenAI. -- Emits chunk documents referencing figure descriptors and optional image embeddings. - -**Configuration:** - -- 5-minute timeout (sized for batching text embeddings). -- 2048 MB instance memory (increase if batching large embeddings). -- Python 3.11 runtime. -- Uses managed identity for authentication. - -Because the enrichment tree preserves the updated `/document/figures` collection after skill #2 runs, this skill receives a fully enriched array of figure descriptors alongside the markdown source. - -**Input Format:** - -```json -{ - "values": [ - { - "recordId": "1", - "data": { - "text": "# Document...
", - "tables": [...], - "figures": [ - { - "id": "fig1", - "url": "https://storage.../images/doc-fig1.png", - "caption": "Bar chart...", - "imageEmbedding": [0.789, -0.012, ...] - } - ], - "fileName": "doc.pdf" - } - } - ] -} -``` - -**Output Format:** - -```json -{ - "values": [ - { - "recordId": "1", - "data": { - "chunks": [ - { - "id": "doc.pdf-0001", - "content": "Content chunk with
", - "embedding": [0.123, -0.456, ...], - "sourcepage": "doc.pdf-1", - "sourcefile": "doc.pdf", - "images": [ - { - "id": "fig1", - "url": "https://storage.../images/doc-fig1.png", - "caption": "Bar chart...", - "imageEmbedding": [0.789, -0.012, ...] - } - ] - } - ] - }, - "errors": [], - "warnings": [] - } - ] -} - -**Record IDs:** The indexer maintains the original document `recordId` throughout the pipeline. Skills operating on collections (such as `/document/figures/*`) emit per-item suffixes internally, but every response still maps back to the same root document when the enrichment tree is reassembled. -``` - -### 4. Azure AI Search Indexer - -The indexer orchestrates the entire pipeline: - -**Data Source:** - -- Type: `azureblob` -- Container: `content` -- Monitors for new/modified blobs -- Can be configured to track deletions or soft delete markers - -**Skillset:** - -- Custom skill #1 (`/document` context): `document_extractor` (emits per-page text and figure payloads). -- Custom skill #2 (`/document/figures/*` context): `figure_processor` (fans out automatically so each figure is enriched independently before being merged back into `/document/figures`). -- Custom skill #3 (`/document` context): `text_processor` (combines markdown with enriched figures, then produces chunks/embeddings). -- Skill #3 consumes the per-page text output from skill #1 and the enriched figures output from skill #2. - -**Indexer:** - -- Runs on schedule (e.g., every 5 minutes) or on-demand -- Batch size: Configurable (e.g., 10 documents per batch) -- Handles retries with exponential backoff -- Tracks processing state per document -- Supports incremental updates (only processes changed documents) - -## Shared Code: prepdocslib - -All three functions share the same processing logic used by the local ingestion script. 
- -**Location:** `app/backend/prepdocslib/` - -**Shared Modules:** - -- `pdfparser.py` - Document Intelligence and local PDF parsing -- `htmlparser.py` - HTML parsing -- `textparser.py` - Plain text parsing -- `textsplitter.py` - `SentenceTextSplitter` for semantic chunking -- `embeddings.py` - Azure OpenAI and image embedding services -- `blobmanager.py` - Blob storage operations -- `mediadescriber.py` - Figure description using GPT-4o or Content Understanding - -**Deployment:** - -Each function includes `../../backend/prepdocslib` in `requirements.txt` as a local dependency. During deployment: - -1. `pip` resolves the local path dependency. -2. The function deployment packages `prepdocslib` with all its dependencies. -3. The complete package is uploaded to the function's deployment blob container. - -## Configuration - -### Environment Variables (Function Apps) - -All three functions receive the same configuration as the backend app: - -**Azure Services:** - -```bash -# Storage -AZURE_STORAGE_ACCOUNT= -AZURE_STORAGE_CONTAINER=content -AZURE_IMAGESTORAGE_CONTAINER=images - -# Azure OpenAI -AZURE_OPENAI_SERVICE= -AZURE_OPENAI_EMB_DEPLOYMENT= -AZURE_OPENAI_EMB_MODEL_NAME=text-embedding-3-large -AZURE_OPENAI_EMB_DIMENSIONS=3072 - -# Document Intelligence -AZURE_DOCUMENTINTELLIGENCE_SERVICE= - -# Azure AI Vision (for multimodal) -AZURE_VISION_ENDPOINT= - -# Azure AI Search -AZURE_SEARCH_SERVICE= -AZURE_SEARCH_INDEX=gptkbindex -``` - -**Custom Skill Endpoints:** - -```bash -DOCUMENT_EXTRACTOR_SKILL_ENDPOINT=https://... -DOCUMENT_EXTRACTOR_SKILL_RESOURCE_ID=api://... -FIGURE_PROCESSOR_SKILL_ENDPOINT=https://... -FIGURE_PROCESSOR_SKILL_RESOURCE_ID=api://... -TEXT_PROCESSOR_SKILL_ENDPOINT=https://... -TEXT_PROCESSOR_SKILL_RESOURCE_ID=api://... 
-``` - -**Feature Flags:** - -```bash -USE_VECTORS=true -USE_MULTIMODAL=false -USE_LOCAL_PDF_PARSER=false -USE_LOCAL_HTML_PARSER=false -USE_MEDIA_DESCRIBER_AZURE_CU=false -``` - -**Authentication:** -All functions use **managed identity** (no connection strings or keys). - -### Bicep Parameters - -**infra/main.parameters.json:** - -```json -{ - "useCloudIngestion": { - "value": "${USE_CLOUD_INGESTION=false}" - } -} -``` - -When `useCloudIngestion=true`: - -- Deploys three Azure Functions (document_extractor, figure_processor, text_processor) on the Flex Consumption plan. -- Creates managed identities with appropriate role assignments (Storage, Search, Document Intelligence, OpenAI, Vision). -- Provisions the indexer, skillset, and data source. -- Configures the backend to use cloud ingestion. - -## Local vs Cloud Ingestion - -### Local Ingestion (Default) - -**Command:** `./scripts/prepdocs.sh` - -**Process:** - -1. Run `prepdocs.py` locally. -2. Upload documents to blob storage. -3. Process documents locally (parse, split, embed). -4. Upload chunks directly to Azure AI Search index. -5. MD5 tracking to skip unchanged files. - -**Use Cases:** - -- Initial data seeding. -- Development and testing. -- CI/CD pipelines. -- Small datasets. -- When you need immediate control. - -**Pros:** - -- Simple, direct control. -- Fast for small datasets. -- Works offline (can process locally before uploading). - -**Cons:** - -- Not scalable for large datasets. -- Requires local compute resources. -- No automatic incremental updates. - -### Cloud Ingestion - -**Command:** `./scripts/prepdocs.sh --use-cloud-ingestion` - -**Process:** - -1. Upload documents to blob storage only. -2. Indexer automatically detects new/changed documents. -3. Functions process documents in parallel (figures and text scale independently). -4. Chunks written directly to search index. -5. Indexer tracks state (no MD5 needed). - -**Use Cases:** - -- Production environments. -- Large datasets. 
-- Continuous ingestion (monitoring blob container). -- Event-driven processing. -- Horizontal scaling requirements. - -**Pros:** - -- Serverless, scales automatically (up to 1000 instances). -- No local compute needed. -- Built-in retry and error handling. -- Incremental updates (only processes changes). -- Cost-effective (pay only when processing). - -**Cons:** - -- Slightly more complex setup. -- Depends on Azure services being available. -- Indexer scheduling introduces latency (configurable). - -## MD5 Tracking - -**Local ingestion** uses MD5 files to track uploaded documents: - -- MD5 hash stored in `data/*.md5` files. -- Used to skip re-uploading unchanged files to blob storage. -- Still needed even with cloud ingestion for initial uploads. - -**Cloud ingestion** does not need MD5 for processing: - -- Indexer uses blob `lastModified` timestamp. -- Automatically detects new and changed documents. -- No MD5 files created for processed chunks. - -## Deployment - -### Prerequisites - -1. Azure CLI with Functions extension: - - ```bash - az extension add --name functions - ``` - -1. Azure Functions Core Tools v4: - - ```bash - brew install azure-functions-core-tools@4 # macOS - ``` - -### Deploy Infrastructure - -```bash -azd provision -``` - -This creates: - -- Function App (Flex Consumption plan). -- Three function deployments (`document_extractor`, `figure_processor`, `text_processor`). -- Managed identities. -- Role assignments (Storage, OpenAI, Document Intelligence, Vision, Search). -- Indexer, skillset, and data source (if `USE_CLOUD_INGESTION=true`). - -### Deploy Function Code - -Functions are deployed as part of `azd up` / `azd deploy`. Manual `func azure functionapp publish` steps are not supported in this workflow—always let azd handle packaging, app settings, and managed identity assignments so that the skillset stays in sync with infrastructure as code. 
- -### Upload Initial Data - -```bash -# Upload documents (triggers indexer if cloud ingestion enabled) -./scripts/prepdocs.sh - -# Or explicitly use cloud ingestion -./scripts/prepdocs.sh --use-cloud-ingestion -``` - -## Monitoring - -### Application Insights - -All three functions send telemetry to Application Insights: - -- Request duration and success rate. -- Custom skill execution metrics. -- Error logs with stack traces. -- Performance counters. - -**View in Azure Portal:** - -1. Navigate to Function App → Application Insights. -2. Check "Live Metrics" for real-time monitoring. -3. Use "Failures" blade for error analysis. - -### Indexer Status - -Check indexer execution history: - -```bash -# Azure CLI -az search indexer show \ - --service-name \ - --name - -# Or via Azure Portal -# Navigate to Search Service → Indexers → View execution history -``` - -### Function Logs - -Stream function logs in real-time: - -```bash -func azure functionapp logstream -``` - -## Troubleshooting - -### Function Timeouts - -**Symptom:** Functions timing out on large documents - -**Solution:** - -- Increase `functionTimeout` in `host.json` (max 10 minutes for `document_extractor`). -- Increase instance memory (2048 MB or 4096 MB). -- Consider splitting very large documents before upload. - -### Embedding Rate Limits - -**Symptom:** HTTP 429 errors from OpenAI - -**Solution:** - -- The embedding service includes retry logic with exponential backoff. -- Reduce indexer batch size to process fewer documents concurrently. -- Increase OpenAI deployment capacity (TPM). - -### Missing Images - -**Symptom:** Figures not appearing in search results - -**Solution:** - -- Verify `USE_MULTIMODAL=true` is set. -- Check that images container has proper CORS settings. -- Verify function has "Storage Blob Data Contributor" role. -- Check Application Insights for image upload errors. 
- -### Indexer Failures - -**Symptom:** Indexer shows failed executions - -**Solution:** - -- Check indexer execution history for error details. -- Verify custom skill URLs are accessible (not 404). -- Check function authentication (managed identity or keys configured in skillset). -- Review function logs in Application Insights. - -## Cost Optimization - -### Function App - -**Flex Consumption Billing:** - -- Execution time × memory provisioned (GB-seconds). -- Number of executions. -- Always-ready instances (if configured). - -**Tips:** - -- Use 2048 MB memory unless you need more (text processing or multimodal workloads). -- Set appropriate timeouts (don't over-provision). -- Don't use always-ready instances for this workload (batch processing). - -### Indexer - -**Indexer Runs:** - -- Free tier: Limited indexer runs per day. -- Standard tier: Unlimited runs. - -**Tips:** - -- Adjust schedule based on upload frequency (don't run too frequently). -- Use on-demand indexer runs for manual uploads. -- Enable "high water mark" change detection (only processes new/changed docs). - -## Security - -### Managed Identities - -All authentication uses **managed identities** (no secrets): - -- Function App → Storage (read content, write images). -- Function App → OpenAI (embeddings, GPT-4o). -- Function App → Document Intelligence (parsing). -- Function App → Vision (figure analysis and captioning). -- Function App → Search (index writing). - -### Network Security - -Optional private networking: - -- Functions can be deployed in a Virtual Network. -- Private endpoints for Storage, OpenAI, Document Intelligence, Vision. -- Network isolation for production workloads. - -### Access Control - -Custom skills authenticate with **Microsoft Entra ID** using managed identities: - -- Azure AI Search calls each function using its system- or user-assigned managed identity and the skill's `authResourceId`. 
-- Each Function App enables App Service Authentication (Easy Auth) and trusts tokens issued for the registered application ID. -- Disable or avoid distributing function keys; they are unnecessary when managed identity is configured. - -## Performance - -### Throughput - -**Expected performance:** - -- Document extraction: 1-2 minutes per document (with multimodal). -- Figure processing: 10-20 seconds per document (depends on vision workload). -- Text processing and embedding: 10-30 seconds per document. -- End-to-end: 2-3 minutes per document. - -**Scaling:** - -- Indexer batch size: 10 documents (configurable). -- Function instances: Auto-scale based on load. -- Max concurrent executions: Limited by OpenAI TPM quota. - -### Optimization Tips - -1. **Batch uploads:** Upload multiple documents at once for parallel processing -2. **Pre-process documents:** Remove unnecessary content before upload -3. **Tune chunk size:** Balance between retrieval quality and processing time -4. **Use local parsers:** Faster but lower quality for simple documents - -## Migration from Local to Cloud Ingestion - -### Step-by-Step - -1. **Deploy infrastructure:** - - ```bash - azd env set USE_CLOUD_INGESTION true - azd provision - ``` - -1. **Test with sample documents:** - - ```bash - # Upload a few test documents - az storage blob upload-batch \ - -s ./data -d content \ - --account-name - ``` - -1. **Verify indexer runs:** - - Check Azure Portal → Search Service → Indexers - - Verify documents appear in index - -1. **Upload full dataset:** - - ```bash - ./scripts/prepdocs.sh --use-cloud-ingestion - ``` - -1. 
**Monitor progress:** - - Application Insights → Live Metrics - - Indexer execution history - -### Rollback - -To revert to local ingestion: - -```bash -azd env set USE_CLOUD_INGESTION false -./scripts/prepdocs.sh # Uses local processing -``` - -## References - -- [Azure AI Search Custom Skills](https://learn.microsoft.com/azure/search/cognitive-search-custom-skill-web-api) -- [Azure Functions Flex Consumption Plan](https://learn.microsoft.com/azure/azure-functions/flex-consumption-plan) -- [Azure AI Search Indexers](https://learn.microsoft.com/azure/search/search-indexer-overview) -- [Custom Skill Interface](https://learn.microsoft.com/azure/search/cognitive-search-custom-skill-interface) diff --git a/docs/data_ingestion.md b/docs/data_ingestion.md index 18f9651c8b..beccbd55e6 100644 --- a/docs/data_ingestion.md +++ b/docs/data_ingestion.md @@ -14,6 +14,7 @@ The chat app provides two ways to ingest data: manual ingestion and cloud-based - [Indexing additional documents](#indexing-additional-documents) - [Removing documents](#removing-documents) - [Cloud-based ingestion](#cloud-based-ingestion) + - [Custom skills pipeline](#custom-skills-pipeline) - [Indexing of additional documents](#indexing-of-additional-documents) - [Removal of documents](#removal-of-documents) - [Scheduled indexing](#scheduled-indexing) @@ -139,17 +140,54 @@ You must first explicitly [enable cloud ingestion](./deploy_features.md#enabling This feature cannot be used on existing index. You need to create a new index or drop and recreate an existing index. In the newly created index schema, a new field 'parent_id' is added. This is used internally by the indexer to manage life cycle of chunks. +### Custom skills pipeline + +The cloud ingestion pipeline uses three Azure Functions as custom skills, plus a built-in Shaper skill, within an Azure AI Search indexer. Each skill corresponds to a stage in the ingestion process. Here's how it works: + +1. **User uploads documents** to Azure Blob Storage (content container) +2. 
**Azure AI Search Indexer** monitors the blob container and orchestrates processing +3. **Custom skills** process documents through four stages: + - **Document Extractor** (Skill #1): Extracts text and figure metadata from source documents + - **Figure Processor** (Skill #2): Enriches figures with descriptions and embeddings + - **Shaper Skill** (Skill #3): Built-in Azure AI Search skill that consolidates enriched data + - **Text Processor** (Skill #4): Combines text with enriched figures, chunks content, and generates embeddings +4. **Azure AI Search Index** receives the final processed chunks with embeddings + +#### [Document Extractor Function](app/functions/document_extractor/) + +- Implements the [document extraction](#document-extraction) stage +- Emits markdown text with `
` placeholders and figure metadata + +#### [Figure Processor Function](app/functions/figure_processor/) + +- Implements the [figure processing](#figure-processing) stage +- Emits enriched figure metadata with descriptions, URLs, and embeddings + +#### [Shaper Skill](https://learn.microsoft.com/en-us/azure/search/cognitive-search-skill-shaper) + +- Consolidates enrichments from the figure processor back into the main document context +- Required because Azure AI Search's enrichment tree isolates data by context +- The Shaper explicitly combines: + - Original `pages` array from `document_extractor` + - Enriched `figures` array with descriptions, URLs, and embeddings from `figure_processor` + - File metadata (file_name, storageUrl) +- Creates a `consolidated_document` object that the text processor can consume + +#### [Text Processor Function](app/functions/text_processor/) + +- Implements the [text processing](#text-processing) stage (figure merging, chunking, embedding) +- Receives the consolidated document with enriched figures from the Shaper skill +- Emits search-ready chunks with figure references and embeddings + ### Indexing of additional documents To add additional documents to the index, first upload them to your data source (Blob storage, by default). -Then navigate to the Azure portal, find the index, and run it. -The Azure AI Search indexer will identify the new documents and ingest them into the index. +Then navigate to the Azure portal and run the indexer. The Azure AI Search indexer will identify the new documents and ingest them into the index. ### Removal of documents To remove documents from the index, remove them from your data source (Blob storage, by default). -Then navigate to the Azure portal, find the index, and run it. -The Azure AI Search indexer will take care of removing those documents from the index. +Then navigate to the Azure portal and run the indexer. 
The Azure AI Search indexer will take care of removing those documents from the index. ### Scheduled indexing diff --git a/scripts/compare_search_indexes.py b/scripts/compare_search_indexes.py deleted file mode 100644 index 78d8bf9fb7..0000000000 --- a/scripts/compare_search_indexes.py +++ /dev/null @@ -1,376 +0,0 @@ -"""Compare documents across two Azure AI Search indexes""" - -import argparse -import asyncio -import logging -import os -from collections.abc import Iterable, Mapping -from dataclasses import dataclass, field -from typing import Any, cast - -from azure.core.credentials_async import AsyncTokenCredential -from azure.identity.aio import AzureDeveloperCliCredential -from azure.search.documents.aio import SearchClient -from Levenshtein import ratio - -from load_azd_env import load_azd_env - -logger = logging.getLogger("scripts") - -IndexKey = tuple[str | None, str | None] - - -@dataclass -class IndexComparisonResult: - """Holds summary data for one index.""" - - index_name: str - total_documents: int - keys: set[IndexKey] - documents_by_key: dict[IndexKey, list[dict[str, Any]]] = field(default_factory=dict) - - -async def collect_index_documents( - *, endpoint: str, credential: AsyncTokenCredential, index_name: str -) -> IndexComparisonResult: - """Collect all documents grouped by (sourcefile, sourcepage) pairs for the specified index.""" - - keys: set[IndexKey] = set() - documents_by_key: dict[IndexKey, list[dict[str, Any]]] = {} - total_documents = 0 - - async with SearchClient(endpoint=endpoint, index_name=index_name, credential=credential) as client: - results = await client.search( - search_text="", - select="*", - include_total_count=True, - ) - async for doc in results: - document = cast(Mapping[str, Any], doc) - total_documents += 1 - sourcefile = document.get("sourcefile") - sourcepage = document.get("sourcepage") - key = (sourcefile, sourcepage) - keys.add(key) - if key not in documents_by_key: - documents_by_key[key] = [] - 
documents_by_key[key].append(dict(document)) - - return IndexComparisonResult( - index_name=index_name, total_documents=total_documents, keys=keys, documents_by_key=documents_by_key - ) - - -def parse_args() -> argparse.Namespace: - """Parse command-line arguments.""" - - parser = argparse.ArgumentParser( - description="Compare documents across two Azure AI Search indexes using sourcefile/sourcepage pairs.", - ) - parser.add_argument("first_index", help="Name of the first search index to compare.") - parser.add_argument("second_index", help="Name of the second search index to compare.") - return parser.parse_args() - - -def build_endpoint(service_name: str) -> str: - """Return the full endpoint URL for the Azure AI Search service.""" - - return f"https://{service_name}.search.windows.net" - - -def _match_chunks_by_similarity( - first_docs: list[dict[str, Any]], second_docs: list[dict[str, Any]] -) -> list[tuple[dict[str, Any], dict[str, Any], float]]: - """ - Match chunks from two document lists based on content similarity using Levenshtein ratio. - - Returns a list of tuples (doc1, doc2, similarity_score) where each doc1 is matched - to its best matching doc2 based on content similarity. 
- """ - matched_pairs = [] - used_second_indices = set() - - for doc1 in first_docs: - content1 = doc1.get("content", "") - best_match = None - best_similarity = 0.0 - best_idx = -1 - - # Find the best matching chunk from second_docs - for idx, doc2 in enumerate(second_docs): - if idx in used_second_indices: - continue - - content2 = doc2.get("content", "") - # Normalize whitespace for comparison - normalized1 = " ".join(str(content1).split()) - normalized2 = " ".join(str(content2).split()) - - # Calculate similarity ratio (0.0 to 1.0) - similarity = ratio(normalized1, normalized2) - - if similarity > best_similarity: - best_similarity = similarity - best_match = doc2 - best_idx = idx - - if best_match is not None: - matched_pairs.append((doc1, best_match, best_similarity)) - used_second_indices.add(best_idx) - else: - # No match found, pair with None - matched_pairs.append((doc1, {}, 0.0)) - - # Add any unmatched docs from second_docs - for idx, doc2 in enumerate(second_docs): - if idx not in used_second_indices: - matched_pairs.append(({}, doc2, 0.0)) - - return matched_pairs - - -async def compare_indexes( - *, first_index: str, second_index: str, endpoint: str, credential: AsyncTokenCredential -) -> None: - """Fetch documents from both indexes and report detailed field differences.""" - - first_result, second_result = await asyncio.gather( - collect_index_documents(endpoint=endpoint, credential=credential, index_name=first_index), - collect_index_documents(endpoint=endpoint, credential=credential, index_name=second_index), - ) - - missing_from_second = first_result.keys - second_result.keys - missing_from_first = second_result.keys - first_result.keys - - logger.info( - "Index '%s': %d docs, %d unique source pairs", - first_result.index_name, - first_result.total_documents, - len(first_result.keys), - ) - logger.info( - "Index '%s': %d docs, %d unique source pairs", - second_result.index_name, - second_result.total_documents, - len(second_result.keys), - ) - - 
def format_missing(pairs: Iterable[IndexKey]) -> str: - return "\n".join( - f" sourcefile={sourcefile or ''}, sourcepage={sourcepage or ''}" - for sourcefile, sourcepage in sorted(pairs) - ) - - if missing_from_second: - logger.warning( - "Pairs present in '%s' but missing in '%s':\n%s", - first_index, - second_index, - format_missing(missing_from_second), - ) - if missing_from_first: - logger.warning( - "Pairs present in '%s' but missing in '%s':\n%s", - second_index, - first_index, - format_missing(missing_from_first), - ) - - # Compare common keys for field differences - common_keys = first_result.keys & second_result.keys - if common_keys: - logger.info("Comparing %d common source pairs for field differences...", len(common_keys)) - differences_found = False - - for key in sorted(common_keys): - first_docs = first_result.documents_by_key[key] - second_docs = second_result.documents_by_key[key] - - if len(first_docs) != len(second_docs): - differences_found = True - logger.warning("\n=== MISMATCH for sourcefile=%s, sourcepage=%s ===", key[0], key[1]) - logger.warning( - " Document count: %s has %d chunks, %s has %d chunks", - first_index, - len(first_docs), - second_index, - len(second_docs), - ) - - # Match chunks by content similarity instead of position - matched_pairs = _match_chunks_by_similarity(first_docs, second_docs) - - # Compare field sets and values for each matched document pair - for idx, (doc1, doc2, similarity) in enumerate(matched_pairs): - # Skip if one or both documents are empty (unmatched) - if not doc1 or not doc2: - differences_found = True - logger.warning( - "\n=== UNMATCHED CHUNK for sourcefile=%s, sourcepage=%s ===", - key[0], - key[1], - ) - if not doc1: - logger.warning(" Chunk only in %s: ID=%s", second_index, doc2.get("id")) - if not doc2: - logger.warning(" Chunk only in %s: ID=%s", first_index, doc1.get("id")) - continue - - if similarity < 0.8: - logger.warning( - "\n=== LOW SIMILARITY MATCH for sourcefile=%s, sourcepage=%s 
(chunk pair %d) ===", - key[0], - key[1], - idx, - ) - logger.warning(" Content similarity: %.2f%%", similarity * 100) - logger.warning(" %s ID: %s", first_index, doc1.get("id")) - logger.warning(" %s ID: %s", second_index, doc2.get("id")) - - fields1 = set(doc1.keys()) - fields2 = set(doc2.keys()) - - missing_fields_in_second = fields1 - fields2 - missing_fields_in_first = fields2 - fields1 - - has_field_diff = missing_fields_in_second or missing_fields_in_first - has_value_diff = False - value_diffs: list[tuple[str, Any, Any]] = [] - embedding_diffs: list[tuple[str, int | None, int | None]] = [] - - # Get common fields first - common_fields = fields1 & fields2 - - # Compare embedding fields separately (dimension only, not values) - for field_name in sorted(common_fields): - if "embedding" in field_name.lower(): - val1 = doc1[field_name] - val2 = doc2[field_name] - dim1 = len(val1) if isinstance(val1, list) else None - dim2 = len(val2) if isinstance(val2, list) else None - if dim1 != dim2: - embedding_diffs.append((field_name, dim1, dim2)) - - # Compare values for common fields (excluding embeddings and large fields) - for field_name in sorted(common_fields): - # Skip embedding fields and other large binary/array fields - if "embedding" in field_name.lower() or field_name.startswith("@search"): - continue - - val1 = doc1[field_name] - val2 = doc2[field_name] - - # Special handling for images field - if field_name == "images": - if isinstance(val1, list) and isinstance(val2, list): - if len(val1) != len(val2): - has_value_diff = True - value_diffs.append((field_name, val1, val2)) - elif len(val1) > 0: - # Compare first image's non-embedding fields - img1_keys = set(val1[0].keys()) - {"embedding"} - img2_keys = set(val2[0].keys()) - {"embedding"} - if img1_keys != img2_keys: - has_value_diff = True - value_diffs.append((field_name, val1, val2)) - # Check image embedding dimensions - for img_idx, (img1, img2) in enumerate(zip(val1, val2)): - if "embedding" in img1 
and "embedding" in img2: - emb1 = img1["embedding"] - emb2 = img2["embedding"] - dim1 = len(emb1) if isinstance(emb1, list) else None - dim2 = len(emb2) if isinstance(emb2, list) else None - if dim1 != dim2: - embedding_diffs.append((f"images[{img_idx}].embedding", dim1, dim2)) - elif val1 != val2: - has_value_diff = True - value_diffs.append((field_name, val1, val2)) - # Special handling for content field - normalize whitespace - elif field_name == "content": - normalized1 = " ".join(str(val1).split()) if val1 else "" - normalized2 = " ".join(str(val2).split()) if val2 else "" - if normalized1 != normalized2: - has_value_diff = True - value_diffs.append((field_name, val1, val2)) - elif val1 != val2: - has_value_diff = True - value_diffs.append((field_name, val1, val2)) - - if has_field_diff or has_value_diff or embedding_diffs: - differences_found = True - logger.warning( - "\n=== DIFFERENCE for sourcefile=%s, sourcepage=%s (chunk %d) ===", key[0], key[1], idx - ) - - if missing_fields_in_second: - logger.warning(" Fields only in %s: %s", first_index, sorted(missing_fields_in_second)) - if missing_fields_in_first: - logger.warning(" Fields only in %s: %s", second_index, sorted(missing_fields_in_first)) - - if embedding_diffs: - for field_name, dim1, dim2 in embedding_diffs: - logger.warning(" Embedding field '%s' dimension mismatch:", field_name) - logger.warning(" %s: %s dimensions", first_index, dim1) - logger.warning(" %s: %s dimensions", second_index, dim2) - - for field_name, val1, val2 in value_diffs: - logger.warning(" Field '%s':", field_name) - logger.warning(" %s: %s", first_index, _format_value(val1, field_name)) - logger.warning(" %s: %s", second_index, _format_value(val2, field_name)) - - if not differences_found: - logger.info("No field differences found for common source pairs.") - - if not missing_from_first and not missing_from_second and not differences_found: - logger.info("Indexes are identical.") - - -def _format_value(val: Any, field_name: 
str | None = None) -> str: - """Format a field value for logging, truncating if necessary.""" - if val is None: - return "" - if isinstance(val, str): - return val[:200] + "..." if len(val) > 200 else val - if isinstance(val, list): - # Special formatting for images field - if field_name == "images" and len(val) > 0 and isinstance(val[0], dict): - img_keys = sorted(set(val[0].keys()) - {"embedding"}) - return f"[{len(val)} images with fields: {img_keys}]" - return f"[{len(val)} items]" if len(val) > 5 else str(val) - return str(val) - - -async def main() -> None: - """Entry point for asynchronous execution.""" - - args = parse_args() - - load_azd_env() - - service_name = os.getenv("AZURE_SEARCH_SERVICE") - if not service_name: - raise RuntimeError( - "AZURE_SEARCH_SERVICE must be set. Run 'azd env get-values' or ensure azd environment is loaded." - ) - - endpoint = build_endpoint(service_name) - - tenant_id = os.getenv("AZURE_TENANT_ID") - credential = AzureDeveloperCliCredential(tenant_id=tenant_id) if tenant_id else AzureDeveloperCliCredential() - - try: - await compare_indexes( - first_index=args.first_index, - second_index=args.second_index, - endpoint=endpoint, - credential=credential, - ) - finally: - await credential.close() - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO, format="%(message)s") - logger.setLevel(logging.DEBUG) - - asyncio.run(main()) diff --git a/tests/test_servicesetup.py b/tests/test_servicesetup.py index f4f657eccb..b5d385fd84 100644 --- a/tests/test_servicesetup.py +++ b/tests/test_servicesetup.py @@ -307,6 +307,7 @@ def test_setup_parser_text_file(): file_name="test.txt", content_type="text/plain", azure_credential=MockAzureCredential(), + document_intelligence_service=None, ) assert isinstance(parser, TextParser) @@ -336,4 +337,5 @@ def test_setup_parser_unsupported_file_type(): file_name="test.xyz", content_type="application/xyz", azure_credential=MockAzureCredential(), + document_intelligence_service=None, ) 
From b733d20eca203ee546cc097a99c8541bd79e36fe Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Tue, 11 Nov 2025 15:01:21 -0800 Subject: [PATCH 19/30] Clean up vectorization docs and refs --- README.md | 2 +- app/backend/prepdocslib/cloudingestionstrategy.py | 2 +- app/backend/prepdocslib/searchmanager.py | 3 --- app/functions/document_extractor/function_app.py | 2 +- docs/data_ingestion.md | 6 ++++-- docs/deploy_features.md | 4 +--- docs/multimodal.md | 1 - 7 files changed, 8 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index ff4041c801..181573b13b 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ The repo includes sample data so it's ready to try end to end. In this sample ap - Chat (multi-turn) and Q&A (single turn) interfaces - Renders citations and thought process for each answer - Includes settings directly in the UI to tweak the behavior and experiment with options -- Integrates Azure AI Search for indexing and retrieval of documents, with support for [many document formats](/docs/data_ingestion.md#supported-document-formats) as well as [cloud-based data ingestion](/docs/data_ingestion.md#overview-of-cloud-based-vectorization) +- Integrates Azure AI Search for indexing and retrieval of documents, with support for [many document formats](/docs/data_ingestion.md#supported-document-formats) as well as [cloud-based data ingestion](/docs/data_ingestion.md#cloud-based-ingestion) - Optional usage of [multimodal models](/docs/multimodal.md) to reason over image-heavy documents - Optional addition of [speech input/output](/docs/deploy_features.md#enabling-speech-inputoutput) for accessibility - Optional automation of [user login and data access](/docs/login_and_acl.md) via Microsoft Entra diff --git a/app/backend/prepdocslib/cloudingestionstrategy.py b/app/backend/prepdocslib/cloudingestionstrategy.py index 745b48c042..4238600ddf 100644 --- a/app/backend/prepdocslib/cloudingestionstrategy.py +++ 
b/app/backend/prepdocslib/cloudingestionstrategy.py @@ -266,7 +266,7 @@ async def setup(self) -> None: logger.info("Setting up search index and skillset for cloud ingestion") if not self.embeddings.azure_endpoint or not self.embeddings.azure_deployment_name: - raise ValueError("Integrated vectorization requires Azure OpenAI endpoint and deployment") + raise ValueError("Cloud ingestion requires Azure OpenAI endpoint and deployment") if not isinstance(self.embeddings, OpenAIEmbeddings): raise TypeError("Cloud ingestion requires Azure OpenAI embeddings to configure the search index.") diff --git a/app/backend/prepdocslib/searchmanager.py b/app/backend/prepdocslib/searchmanager.py index fdcac5e9a6..f1de6bc0b8 100644 --- a/app/backend/prepdocslib/searchmanager.py +++ b/app/backend/prepdocslib/searchmanager.py @@ -69,9 +69,6 @@ def __init__( search_info: SearchInfo, search_analyzer_name: Optional[str] = None, use_acls: bool = False, - # Renamed from use_int_vectorization to use_parent_index_projection to reflect - # that this flag controls parent/child index projection (adding parent_id and - # enhanced key field settings) rather than any specific vectorization mode. 
use_parent_index_projection: bool = False, embeddings: Optional[OpenAIEmbeddings] = None, field_name_embedding: Optional[str] = None, diff --git a/app/functions/document_extractor/function_app.py b/app/functions/document_extractor/function_app.py index 958974a388..0f2758fd21 100644 --- a/app/functions/document_extractor/function_app.py +++ b/app/functions/document_extractor/function_app.py @@ -68,7 +68,7 @@ async def extract_document(req: func.HttpRequest) -> func.HttpResponse: Azure Search Custom Skill: Extract document content Input format (single record; file data only): - # https://learn.microsoft.com/en-us/azure/search/cognitive-search-skill-document-intelligence-layout#skill-inputs + # https://learn.microsoft.com/azure/search/cognitive-search-skill-document-intelligence-layout#skill-inputs { "values": [ { diff --git a/docs/data_ingestion.md b/docs/data_ingestion.md index beccbd55e6..329b82934f 100644 --- a/docs/data_ingestion.md +++ b/docs/data_ingestion.md @@ -36,7 +36,7 @@ In order to ingest a document format, we need a tool that can turn it into text. ## Ingestion stages -The ingestion pipeline consists of three main stages that transform raw documents into searchable content in Azure AI Search. These stages apply to both local ingestion (using `prepdocs.py`) and cloud-based ingestion (using Azure Functions as custom skills). +The ingestion pipeline consists of three main stages that transform raw documents into searchable content in Azure AI Search. These stages apply to both [local ingestion](#local-ingestion) and [cloud-based ingestion](#cloud-based-ingestion). ### Document extraction @@ -153,6 +153,8 @@ The cloud ingestion pipeline uses four Azure Functions as custom skills within a - **Text Processor** (Skill #4): Combines text with enriched figures, chunks content, and generates embeddings 4. 
**Azure AI Search Index** receives the final processed chunks with embeddings +The functions are defined in the `app/functions/` directory, and the custom skillset is configured in the `app/backend/setup_cloud_ingestion.py` script. + #### [Document Extractor Function](app/functions/document_extractor/) - Implements the [document extraction](#document-extraction) stage @@ -163,7 +165,7 @@ The cloud ingestion pipeline uses four Azure Functions as custom skills within a - Implements the [figure processing](#figure-processing) stage - Emits enriched figure metadata with descriptions, URLs, and embeddings -#### [Shaper Skill](https://learn.microsoft.com/en-us/azure/search/cognitive-search-skill-shaper) +#### [Shaper Skill](https://learn.microsoft.com/azure/search/cognitive-search-skill-shaper) - Consolidates enrichments from the figure processor back into the main document context - Required because Azure AI Search's enrichment tree isolates data by context diff --git a/docs/deploy_features.md b/docs/deploy_features.md index 467a5991d6..0c63ca9b40 100644 --- a/docs/deploy_features.md +++ b/docs/deploy_features.md @@ -12,7 +12,6 @@ You should typically enable these features before running `azd up`. Once you've * [Enabling persistent chat history with Azure Cosmos DB](#enabling-persistent-chat-history-with-azure-cosmos-db) * [Enabling language picker](#enabling-language-picker) * [Enabling speech input/output](#enabling-speech-inputoutput) -* [Enabling Integrated Vectorization](#enabling-integrated-vectorization) * [Enabling authentication](#enabling-authentication) * [Enabling login and document level access control](#enabling-login-and-document-level-access-control) * [Enabling user document upload](#enabling-user-document-upload) @@ -236,8 +235,7 @@ Learn more in the [multimodal guide](./multimodal.md). 
## Enabling media description with Azure Content Understanding -⚠️ This feature is not currently compatible with [integrated vectorization](#enabling-integrated-vectorization). -It is compatible with the [multimodal feature](./multimodal.md), but this feature enables only a subset of multimodal capabilities, +⚠️ This feature is compatible with the [multimodal feature](./multimodal.md), but this feature enables only a subset of multimodal capabilities, so you may want to enable the multimodal feature instead or as well. By default, if your documents contain image-like figures, the data ingestion process will ignore those figures, diff --git a/docs/multimodal.md b/docs/multimodal.md index b547cc1c37..24b7fb656f 100644 --- a/docs/multimodal.md +++ b/docs/multimodal.md @@ -112,5 +112,4 @@ and you may still see good results with just text inputs, since the inputs conta ## Compatibility -* This feature is **not** compatible with [integrated vectorization](./deploy_features.md#enabling-integrated-vectorization), as the currently configured built-in skills do not process images or store image embeddings. Azure AI Search does now offer built-in skills for multimodal support, as demonstrated in [azure-ai-search-multimodal-sample](https://github.com/Azure-Samples/azure-ai-search-multimodal-sample), but we have not integrated them in this project. Instead, we are working on making a custom skill based off the data ingestion code in this repository, and hosting that skill on Azure Functions. Stay tuned to the releases to find out when that's available. * This feature *is* compatible with the [reasoning models](./reasoning.md) feature, as long as you use a model that [supports image inputs](https://learn.microsoft.com/azure/ai-services/openai/how-to/reasoning?tabs=python-secure%2Cpy#api--feature-support). 
From 1db5f14c03e12f3123548508b091b7d6242a3e27 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Tue, 11 Nov 2025 15:29:05 -0800 Subject: [PATCH 20/30] More code cleanup --- app/backend/prepdocs.py | 8 +--- app/backend/prepdocslib/figureprocessor.py | 44 +++++++++---------- app/backend/prepdocslib/page.py | 30 ++----------- app/backend/prepdocslib/servicesetup.py | 7 +++ app/backend/prepdocslib/textprocessor.py | 33 +++----------- app/backend/setup_cloud_ingestion.py | 26 +++-------- .../figure_processor/function_app.py | 2 +- infra/main.bicep | 12 ++--- tests/test_pdfparser.py | 26 +++++------ tests/test_prepdocslib_filestrategy.py | 22 +++++----- 10 files changed, 73 insertions(+), 137 deletions(-) diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py index d3f2cc71a2..5e6502a8db 100644 --- a/app/backend/prepdocs.py +++ b/app/backend/prepdocs.py @@ -26,6 +26,7 @@ from prepdocslib.parser import Parser from prepdocslib.servicesetup import ( OpenAIHost, + clean_key_if_exists, select_parser, setup_blob_manager, setup_embeddings_service, @@ -41,13 +42,6 @@ logger = logging.getLogger("scripts") -def clean_key_if_exists(key: Optional[str]) -> Optional[str]: - """Remove leading and trailing whitespace from a key if it exists. 
If the key is empty, return None.""" - if key is not None and key.strip() != "": - return key.strip() - return None - - async def check_search_service_connectivity(search_service: str) -> bool: """Check if the search service is accessible by hitting the /ping endpoint.""" ping_url = f"https://{search_service}.search.windows.net/ping" diff --git a/app/backend/prepdocslib/figureprocessor.py b/app/backend/prepdocslib/figureprocessor.py index 8bdc2d7f40..ca2b4a9bfc 100644 --- a/app/backend/prepdocslib/figureprocessor.py +++ b/app/backend/prepdocslib/figureprocessor.py @@ -40,14 +40,14 @@ def __init__( openai_deployment: str | None = None, content_understanding_endpoint: str | None = None, ) -> None: - self._credential = credential + self.credential = credential self.strategy = strategy - self._openai_client = openai_client - self._openai_model = openai_model - self._openai_deployment = openai_deployment - self._content_understanding_endpoint = content_understanding_endpoint - self._media_describer: MediaDescriber | None = None - self._content_understanding_ready = False + self.openai_client = openai_client + self.openai_model = openai_model + self.openai_deployment = openai_deployment + self.content_understanding_endpoint = content_understanding_endpoint + self.media_describer: MediaDescriber | None = None + self.content_understanding_ready = False async def get_media_describer(self) -> MediaDescriber | None: """Return (and lazily create) the media describer for this processor.""" @@ -55,30 +55,28 @@ async def get_media_describer(self) -> MediaDescriber | None: if self.strategy == MediaDescriptionStrategy.NONE: return None - if self._media_describer is not None: - return self._media_describer + if self.media_describer is not None: + return self.media_describer if self.strategy == MediaDescriptionStrategy.CONTENTUNDERSTANDING: - if self._content_understanding_endpoint is None: + if self.content_understanding_endpoint is None: raise ValueError("Content Understanding 
strategy requires an endpoint") - if self._credential is None: + if self.credential is None: raise ValueError("Content Understanding strategy requires a credential") - if isinstance(self._credential, AzureKeyCredential): + if isinstance(self.credential, AzureKeyCredential): raise ValueError( "Content Understanding does not support key credentials; provide a token credential instead" ) - self._media_describer = ContentUnderstandingDescriber( - self._content_understanding_endpoint, self._credential - ) - return self._media_describer + self.media_describer = ContentUnderstandingDescriber(self.content_understanding_endpoint, self.credential) + return self.media_describer if self.strategy == MediaDescriptionStrategy.OPENAI: - if self._openai_client is None or self._openai_model is None: + if self.openai_client is None or self.openai_model is None: raise ValueError("OpenAI strategy requires both a client and a model name") - self._media_describer = MultimodalModelDescriber( - self._openai_client, model=self._openai_model, deployment=self._openai_deployment + self.media_describer = MultimodalModelDescriber( + self.openai_client, model=self.openai_model, deployment=self.openai_deployment ) - return self._media_describer + return self.media_describer logger.warning("Unknown media description strategy '%s'; skipping description", self.strategy) return None @@ -86,7 +84,7 @@ async def get_media_describer(self) -> MediaDescriber | None: def mark_content_understanding_ready(self) -> None: """Record that the Content Understanding analyzer exists to avoid recreating it.""" - self._content_understanding_ready = True + self.content_understanding_ready = True async def describe(self, image_bytes: bytes) -> str | None: """Generate a description for the provided image bytes if a describer is available.""" @@ -94,9 +92,9 @@ async def describe(self, image_bytes: bytes) -> str | None: describer = await self.get_media_describer() if describer is None: return None - if 
isinstance(describer, ContentUnderstandingDescriber) and not self._content_understanding_ready: + if isinstance(describer, ContentUnderstandingDescriber) and not self.content_understanding_ready: await describer.create_analyzer() - self._content_understanding_ready = True + self.content_understanding_ready = True return await describer.describe_image(image_bytes) diff --git a/app/backend/prepdocslib/page.py b/app/backend/prepdocslib/page.py index e9962fdf3e..41cfb0cc2c 100644 --- a/app/backend/prepdocslib/page.py +++ b/app/backend/prepdocslib/page.py @@ -21,47 +21,23 @@ def to_skill_payload( self, file_name: str, *, - include_bytes: bool = False, include_bytes_base64: bool = True, ) -> dict[str, Any]: - """Serialize this figure for the figure_processor skill output. - - Parameters: - file_name: Source document file name. - include_bytes: When True, include the raw ``bytes`` field. Defaults to False to avoid - bloating payload size and because JSON serialization of raw bytes is not desired. - include_bytes_base64: When True (default), include a base64 representation of the image - as ``bytes_base64`` for downstream skills that might still need the encoded image. - - Notes: - - Previous behavior always included both the raw bytes (via ``asdict``) and a base64 copy. - This is wasteful for typical pipelines where only the blob ``url`` plus lightweight - metadata are required. The new defaults favor minimal payload size. - - Callers needing the raw bytes can opt-in with ``include_bytes=True`` (e.g., for a - chained skill that has not yet persisted the blob or for debugging scenarios). - """ - data = asdict(self) - if not include_bytes and "bytes" in data: - # Remove raw bytes to keep payload lean (and JSON-friendly without extra handling). - data.pop("bytes", None) + # Remove raw bytes to keep payload lean (and JSON-friendly without extra handling). 
+ data.pop("bytes", None) + # Optionally include base64-encoded bytes for skills that need it if include_bytes_base64: - # Always base64 from the current in-memory bytes, not from any cached version, to ensure fidelity. b = self.bytes if isinstance(self.bytes, (bytes, bytearray)) else b"" data["bytes_base64"] = base64.b64encode(b).decode("utf-8") - # Remove None values to prevent document extractor from emitting fields that will be - # enriched by figure processor, avoiding potential conflicts in Azure AI Search enrichment merge - data = {k: v for k, v in data.items() if v is not None} - data["document_file_name"] = file_name return data @classmethod def from_skill_payload(cls, data: dict[str, Any]) -> tuple["ImageOnPage", str]: - """Deserialize a figure skill payload into an ImageOnPage, normalizing fields.""" # Decode base64 image data (optional - may be omitted if already persisted to blob) bytes_base64 = data.get("bytes_base64") if bytes_base64: diff --git a/app/backend/prepdocslib/servicesetup.py b/app/backend/prepdocslib/servicesetup.py index 6dd03dd6c5..d0f4026e09 100644 --- a/app/backend/prepdocslib/servicesetup.py +++ b/app/backend/prepdocslib/servicesetup.py @@ -23,6 +23,13 @@ logger = logging.getLogger("scripts") +def clean_key_if_exists(key: Optional[str]) -> Optional[str]: + """Remove leading and trailing whitespace from a key if it exists. If the key is empty, return None.""" + if key is not None and key.strip() != "": + return key.strip() + return None + + class OpenAIHost(str, Enum): """Supported OpenAI hosting styles. 
diff --git a/app/backend/prepdocslib/textprocessor.py b/app/backend/prepdocslib/textprocessor.py index d7d1d64aad..2895a80588 100644 --- a/app/backend/prepdocslib/textprocessor.py +++ b/app/backend/prepdocslib/textprocessor.py @@ -1,26 +1,18 @@ """Utilities for processing document text and combining it with figure descriptions.""" import logging -from typing import TYPE_CHECKING -if TYPE_CHECKING: # pragma: no cover - used only for type hints - from .listfilestrategy import File - from .page import Page - from .searchmanager import Section - from .textsplitter import TextSplitter +from .figureprocessor import build_figure_markup +from .listfilestrategy import File +from .page import Page +from .searchmanager import Section +from .textsplitter import TextSplitter logger = logging.getLogger("scripts") def combine_text_with_figures(page: "Page") -> None: - """Replace figure placeholders in page text with full description markup. - - This is Skill #3 (text_processor) in the three-skill pipeline. - After figures have been described and enriched, this replaces their - placeholders in the page text with the full
markup. - """ - from .figureprocessor import build_figure_markup - + """Replace figure placeholders in page text with full description markup.""" for image in page.images: if image.description and image.placeholder in page.text: figure_markup = build_figure_markup(image, image.description) @@ -39,22 +31,9 @@ def process_text( category: str | None = None, ) -> list["Section"]: """Process document text and figures into searchable sections. - - This is Skill #3 (text_processor) in the three-skill pipeline. Combines text with figure descriptions, splits into chunks, and associates figures with their containing sections. - - Args: - pages: List of parsed pages with enriched figures - file: Original file being processed - splitter: Text splitter for chunking content - category: Optional category for sections - - Returns: - List of Sections ready for indexing """ - from .searchmanager import Section - # Step 1: Combine text with figures on each page for page in pages: combine_text_with_figures(page) diff --git a/app/backend/setup_cloud_ingestion.py b/app/backend/setup_cloud_ingestion.py index 94e980850a..ac9e617d0e 100644 --- a/app/backend/setup_cloud_ingestion.py +++ b/app/backend/setup_cloud_ingestion.py @@ -3,7 +3,6 @@ import asyncio import logging import os -from typing import Optional from azure.core.credentials_async import AsyncTokenCredential from azure.identity.aio import AzureDeveloperCliCredential @@ -13,8 +12,10 @@ from load_azd_env import load_azd_env from prepdocslib.blobmanager import BlobManager from prepdocslib.cloudingestionstrategy import CloudIngestionStrategy +from prepdocslib.listfilestrategy import LocalListFileStrategy from prepdocslib.servicesetup import ( OpenAIHost, + clean_key_if_exists, setup_blob_manager, setup_embeddings_service, setup_openai_client, @@ -25,13 +26,6 @@ logger = logging.getLogger("scripts") -def clean_key_if_exists(key: Optional[str]) -> Optional[str]: - """Remove leading and trailing whitespace from a key if it exists. 
If the key is empty, return None.""" - if key is not None and key.strip() != "": - return key.strip() - return None - - async def setup_cloud_ingestion_strategy( azure_credential: AsyncTokenCredential, document_action: DocumentAction = DocumentAction.Add, @@ -107,10 +101,8 @@ async def setup_cloud_ingestion_strategy( disable_batch=False, ) - # Create a minimal list file strategy (cloud ingestion doesn't use file listing) - from prepdocslib.listfilestrategy import LocalListFileStrategy - - list_file_strategy = LocalListFileStrategy(path_pattern="", enable_global_documents=False) + # Create a list file strategy for uploading files from the data folder + list_file_strategy = LocalListFileStrategy(path_pattern="data/*", enable_global_documents=False) # Create the cloud ingestion strategy ingestion_strategy = CloudIngestionStrategy( @@ -174,13 +166,9 @@ async def main(): await ingestion_strategy.run() finally: - # Gracefully close any async clients/credentials - try: - await blob_manager.close_clients() - await openai_client.close() - await azd_credential.close() - except Exception as e: - logger.debug(f"Failed to close async clients cleanly: {e}") + await blob_manager.close_clients() + await openai_client.close() + await azd_credential.close() if __name__ == "__main__": diff --git a/app/functions/figure_processor/function_app.py b/app/functions/figure_processor/function_app.py index 6e72741e89..2399171d31 100644 --- a/app/functions/figure_processor/function_app.py +++ b/app/functions/figure_processor/function_app.py @@ -161,7 +161,7 @@ async def process_figure_request(req: func.HttpRequest) -> func.HttpResponse: image_embeddings_client=settings.image_embeddings, figure_processor=settings.figure_processor, ) - figure_payload = image_on_page.to_skill_payload(file_name, include_bytes_base64=False, include_bytes=False) + figure_payload = image_on_page.to_skill_payload(file_name, include_bytes_base64=False) output_values.append( { "recordId": record_id, diff --git 
a/infra/main.bicep b/infra/main.bicep index b30c9eb8e4..811dba0390 100644 --- a/infra/main.bicep +++ b/infra/main.bicep @@ -468,7 +468,7 @@ var appEnvVariables = { AZURE_SEARCH_SERVICE: searchService.outputs.name AZURE_SEARCH_SEMANTIC_RANKER: actualSearchServiceSemanticRankerLevel AZURE_SEARCH_QUERY_REWRITING: searchServiceQueryRewriting - AZURE_VISION_ENDPOINT: useMultimodal ? vision!.outputs.endpoint : '' + AZURE_VISION_ENDPOINT: useMultimodal ? vision.outputs.endpoint : '' AZURE_SEARCH_QUERY_LANGUAGE: searchQueryLanguage AZURE_SEARCH_QUERY_SPELLER: searchQuerySpeller AZURE_SEARCH_FIELD_NAME_EMBEDDING: searchFieldNameEmbedding @@ -656,7 +656,7 @@ module acaAuth 'core/host/container-apps-auth.bicep' = if (deploymentTarget == ' } } -// FUNCTION APPS FOR CLOUD INGESTION +// Optional Azure Functions for document ingestion and processing module functions 'app/functions.bicep' = if (useCloudIngestion) { name: 'functions' scope: resourceGroup @@ -1445,11 +1445,11 @@ output AZURE_OPENAI_EVAL_MODEL string = isAzureOpenAiHost && useEval ? eval.mode output AZURE_OPENAI_SEARCHAGENT_DEPLOYMENT string = isAzureOpenAiHost && useAgenticRetrieval ? searchAgent.deploymentName : '' output AZURE_OPENAI_SEARCHAGENT_MODEL string = isAzureOpenAiHost && useAgenticRetrieval ? searchAgent.modelName : '' output AZURE_OPENAI_REASONING_EFFORT string = defaultReasoningEffort -output AZURE_SPEECH_SERVICE_ID string = useSpeechOutputAzure ? speech!.outputs.resourceId : '' -output AZURE_SPEECH_SERVICE_LOCATION string = useSpeechOutputAzure ? speech!.outputs.location : '' +output AZURE_SPEECH_SERVICE_ID string = useSpeechOutputAzure ? speech.outputs.resourceId : '' +output AZURE_SPEECH_SERVICE_LOCATION string = useSpeechOutputAzure ? speech.outputs.location : '' -output AZURE_VISION_ENDPOINT string = useMultimodal ? vision!.outputs.endpoint : '' -output AZURE_CONTENTUNDERSTANDING_ENDPOINT string = useMediaDescriberAzureCU ? 
contentUnderstanding!.outputs.endpoint : '' +output AZURE_VISION_ENDPOINT string = useMultimodal ? vision.outputs.endpoint : '' +output AZURE_CONTENTUNDERSTANDING_ENDPOINT string = useMediaDescriberAzureCU ? contentUnderstanding.outputs.endpoint : '' output AZURE_DOCUMENTINTELLIGENCE_SERVICE string = documentIntelligence.outputs.name output AZURE_DOCUMENTINTELLIGENCE_RESOURCE_GROUP string = documentIntelligenceResourceGroup.name diff --git a/tests/test_pdfparser.py b/tests/test_pdfparser.py index 3d8e3ae3e7..23cd2dcabf 100644 --- a/tests/test_pdfparser.py +++ b/tests/test_pdfparser.py @@ -18,10 +18,16 @@ DocumentTable, DocumentTableCell, ) +from azure.core.credentials import AzureKeyCredential from azure.core.exceptions import HttpResponseError from PIL import Image, ImageChops -from prepdocslib.figureprocessor import FigureProcessor, MediaDescriptionStrategy +from prepdocslib.figureprocessor import ( + FigureProcessor, + MediaDescriptionStrategy, + build_figure_markup, + process_page_image, +) from prepdocslib.page import ImageOnPage from prepdocslib.pdfparser import DocumentAnalysisParser @@ -433,7 +439,7 @@ def __init__(self, endpoint, credential): result_first = await figure_processor.describe(b"image") assert result_first == "A diagram" - describer_instance = figure_processor._media_describer # type: ignore[attr-defined] + describer_instance = figure_processor.media_describer # type: ignore[attr-defined] assert isinstance(describer_instance, FakeDescriber) describer_instance.create_analyzer.assert_awaited_once() @@ -477,8 +483,6 @@ async def test_figure_processor_content_understanding_missing_credential(): @pytest.mark.asyncio async def test_figure_processor_content_understanding_key_credential(): - from azure.core.credentials import AzureKeyCredential - figure_processor = FigureProcessor( strategy=MediaDescriptionStrategy.CONTENTUNDERSTANDING, credential=AzureKeyCredential("fake_key"), @@ -501,7 +505,7 @@ async def 
test_figure_processor_openai_returns_describer(monkeypatch): describer = await figure_processor.get_media_describer() assert describer is not None - assert figure_processor._media_describer is describer + assert figure_processor.media_describer is describer # Second call should return the same instance describer2 = await figure_processor.get_media_describer() @@ -526,15 +530,13 @@ async def test_figure_processor_unknown_strategy(caplog): async def test_figure_processor_mark_content_understanding_ready(): figure_processor = FigureProcessor(strategy=MediaDescriptionStrategy.NONE) - assert not figure_processor._content_understanding_ready + assert not figure_processor.content_understanding_ready figure_processor.mark_content_understanding_ready() - assert figure_processor._content_understanding_ready + assert figure_processor.content_understanding_ready @pytest.mark.asyncio async def test_build_figure_markup_without_description(sample_image): - from prepdocslib.figureprocessor import build_figure_markup - sample_image.title = "Sample Figure" result = build_figure_markup(sample_image, description=None) @@ -543,8 +545,6 @@ async def test_build_figure_markup_without_description(sample_image): @pytest.mark.asyncio async def test_process_page_image_without_blob_manager(sample_image): - from prepdocslib.figureprocessor import process_page_image - with pytest.raises(ValueError, match="BlobManager must be provided"): await process_page_image( image=sample_image, @@ -556,7 +556,6 @@ async def test_process_page_image_without_blob_manager(sample_image): @pytest.mark.asyncio async def test_process_page_image_without_figure_processor(sample_image): - from prepdocslib.figureprocessor import process_page_image blob_manager = AsyncMock() blob_manager.upload_document_image = AsyncMock(return_value="https://example.com/image.png") @@ -576,7 +575,6 @@ async def test_process_page_image_without_figure_processor(sample_image): @pytest.mark.asyncio async def 
test_process_page_image_sets_description(sample_image): - from prepdocslib.figureprocessor import process_page_image blob_manager = AsyncMock() blob_manager.upload_document_image = AsyncMock(return_value="https://example.com/image.png") @@ -598,7 +596,6 @@ async def test_process_page_image_sets_description(sample_image): @pytest.mark.asyncio async def test_process_page_image_skips_upload_if_url_exists(sample_image): - from prepdocslib.figureprocessor import process_page_image sample_image.url = "https://existing.com/image.png" @@ -618,7 +615,6 @@ async def test_process_page_image_skips_upload_if_url_exists(sample_image): @pytest.mark.asyncio async def test_process_page_image_with_embeddings(sample_image): - from prepdocslib.figureprocessor import process_page_image blob_manager = AsyncMock() blob_manager.upload_document_image = AsyncMock(return_value="https://example.com/image.png") diff --git a/tests/test_prepdocslib_filestrategy.py b/tests/test_prepdocslib_filestrategy.py index d8e8543be7..556d074601 100644 --- a/tests/test_prepdocslib_filestrategy.py +++ b/tests/test_prepdocslib_filestrategy.py @@ -1,14 +1,19 @@ import os +from io import BytesIO import pytest from azure.search.documents.aio import SearchClient from prepdocslib.blobmanager import BlobManager +from prepdocslib.figureprocessor import FigureProcessor, MediaDescriptionStrategy from prepdocslib.fileprocessor import FileProcessor -from prepdocslib.filestrategy import FileStrategy +from prepdocslib.filestrategy import FileStrategy, parse_file from prepdocslib.listfilestrategy import ( ADLSGen2ListFileStrategy, + File, + LocalListFileStrategy, ) +from prepdocslib.page import ImageOnPage, Page from prepdocslib.strategy import SearchInfo from prepdocslib.textparser import TextParser from prepdocslib.textsplitter import SimpleTextSplitter @@ -105,11 +110,6 @@ async def mock_upload_documents(self, documents): @pytest.mark.asyncio async def test_parse_file_with_images(monkeypatch): """Test that parse_file 
processes images and logs appropriately.""" - from io import BytesIO - - from prepdocslib.filestrategy import parse_file - from prepdocslib.listfilestrategy import File - from prepdocslib.page import ImageOnPage, Page # Create a mock file mock_file = File(content=BytesIO(b"test content")) @@ -176,8 +176,6 @@ def mock_process_text(pages, file, splitter, category): @pytest.mark.asyncio async def test_file_strategy_setup_with_content_understanding(monkeypatch, mock_env): """Test that FileStrategy.setup() properly initializes content understanding.""" - from prepdocslib.figureprocessor import FigureProcessor, MediaDescriptionStrategy - from prepdocslib.listfilestrategy import LocalListFileStrategy # Create mock list strategy list_strategy = LocalListFileStrategy(path_pattern="*.txt") @@ -239,8 +237,8 @@ async def mock_create_index(self): await file_strategy.setup() # Verify content understanding was initialized during setup - assert figure_processor._media_describer is not None - assert isinstance(figure_processor._media_describer, MockContentUnderstandingDescriber) + assert figure_processor.media_describer is not None + assert isinstance(figure_processor.media_describer, MockContentUnderstandingDescriber) # create_analyzer should be called during setup for content understanding - assert figure_processor._media_describer.create_analyzer_called - assert figure_processor._content_understanding_ready + assert figure_processor.media_describer.create_analyzer_called + assert figure_processor.content_understanding_ready From 6d4e490a332544879851707670863dfdbf5dd597 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Tue, 11 Nov 2025 15:37:42 -0800 Subject: [PATCH 21/30] Address Copilot feedback on tests --- AGENTS.md | 25 +++++++-- app/backend/prepdocslib/figureprocessor.py | 6 +-- requirements-dev.txt | 1 - tests/test_servicesetup.py | 59 +++++++--------------- 4 files changed, 42 insertions(+), 49 deletions(-) diff --git a/AGENTS.md b/AGENTS.md index 0954e613c7..58e1388fc7 
100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -17,9 +17,28 @@ If necessary, edit this file to ensure it accurately reflects the current state * app/backend/approaches/prompts/chat_query_rewrite.prompty: Prompt used to rewrite the query based off search history into a better search query * app/backend/approaches/prompts/chat_query_rewrite_tools.json: Tools used by the query rewriting prompt * app/backend/approaches/prompts/chat_answer_question.prompty: Prompt used by the Chat approach to actually answer the question based off sources - * app/backend/prepdocslib/cloudingestionstrategy.py: Builds the Azure AI Search indexer and skillset for the cloud ingestion pipeline - * app/backend/prepdocslib/pdfparser.py: Uses Azure Document Intelligence to emit page text plus figure placeholders - * app/backend/prepdocslib/figureprocessor.py: Shared helper that generates figure descriptions for both local ingestion and the cloud figure-processor skill + * app/backend/prepdocslib: Contains the document ingestion library used by both local and cloud ingestion + * app/backend/prepdocslib/blobmanager.py: Manages uploads to Azure Blob Storage + * app/backend/prepdocslib/cloudingestionstrategy.py: Builds the Azure AI Search indexer and skillset for the cloud ingestion pipeline + * app/backend/prepdocslib/csvparser.py: Parses CSV files + * app/backend/prepdocslib/embeddings.py: Generates embeddings for text and images using Azure OpenAI + * app/backend/prepdocslib/figureprocessor.py: Generates figure descriptions for both local ingestion and the cloud figure-processor skill + * app/backend/prepdocslib/fileprocessor.py: Orchestrates parsing and chunking of individual files + * app/backend/prepdocslib/filestrategy.py: Strategy for uploading and indexing files (local ingestion) + * app/backend/prepdocslib/htmlparser.py: Parses HTML files + * app/backend/prepdocslib/integratedvectorizerstrategy.py: Strategy using Azure AI Search integrated vectorization + * 
app/backend/prepdocslib/jsonparser.py: Parses JSON files + * app/backend/prepdocslib/listfilestrategy.py: Lists files from local filesystem or Azure Data Lake + * app/backend/prepdocslib/mediadescriber.py: Interfaces for describing images (Azure OpenAI GPT-4o, Content Understanding) + * app/backend/prepdocslib/page.py: Data classes for pages, images, and chunks + * app/backend/prepdocslib/parser.py: Base parser interface + * app/backend/prepdocslib/pdfparser.py: Parses PDFs using Azure Document Intelligence or local parser + * app/backend/prepdocslib/searchmanager.py: Manages Azure AI Search index creation and updates + * app/backend/prepdocslib/servicesetup.py: Shared service setup helpers for OpenAI, embeddings, blob storage, etc. + * app/backend/prepdocslib/strategy.py: Base strategy interface for document ingestion + * app/backend/prepdocslib/textparser.py: Parses plain text and markdown files + * app/backend/prepdocslib/textprocessor.py: Processes text chunks for cloud ingestion (merges figures, generates embeddings) + * app/backend/prepdocslib/textsplitter.py: Splits text into chunks using different strategies * app/backend/app.py: The main entry point for the backend application. * app/functions: Azure Functions used for cloud ingestion custom skills (document extraction, figure processing, text processing). Each function bundles a synchronized copy of `prepdocslib`; run `python scripts/copy_prepdocslib.py` to refresh the local copies if you modify the library. * app/frontend: Contains the React frontend code, built with TypeScript, built with vite. 
diff --git a/app/backend/prepdocslib/figureprocessor.py b/app/backend/prepdocslib/figureprocessor.py index ca2b4a9bfc..b1e77ca6d4 100644 --- a/app/backend/prepdocslib/figureprocessor.py +++ b/app/backend/prepdocslib/figureprocessor.py @@ -114,9 +114,9 @@ async def process_page_image( *, image: "ImageOnPage", document_filename: str, - blob_manager: Optional["BaseBlobManager"], - image_embeddings_client: Optional["ImageEmbeddings"], - figure_processor: Optional["FigureProcessor"] = None, + blob_manager: Optional[BaseBlobManager], + image_embeddings_client: Optional[ImageEmbeddings], + figure_processor: Optional[FigureProcessor] = None, user_oid: Optional[str] = None, ) -> "ImageOnPage": """Generate description, upload image, and optionally compute embedding for a figure.""" diff --git a/requirements-dev.txt b/requirements-dev.txt index 963cd76694..edc9571a50 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -14,4 +14,3 @@ pip-tools mypy==1.14.1 diff_cover axe-playwright-python -python-Levenshtein diff --git a/tests/test_servicesetup.py b/tests/test_servicesetup.py index b5d385fd84..b41b27988f 100644 --- a/tests/test_servicesetup.py +++ b/tests/test_servicesetup.py @@ -1,11 +1,21 @@ +import openai import pytest +from openai.types.create_embedding_response import Usage +from prepdocslib.embeddings import OpenAIEmbeddings +from prepdocslib.figureprocessor import FigureProcessor, MediaDescriptionStrategy +from prepdocslib.pdfparser import DocumentAnalysisParser from prepdocslib.servicesetup import ( OpenAIHost, + select_parser, setup_blob_manager, setup_embeddings_service, + setup_figure_processor, + setup_image_embeddings_service, setup_openai_client, + setup_search_info, ) +from prepdocslib.textparser import TextParser from .mocks import ( MOCK_EMBEDDING_DIMENSIONS, @@ -38,9 +48,7 @@ def __init__( captured["subscription_id"] = subscription_id captured["image_container"] = image_container - import prepdocslib.servicesetup as servicesetup_module - - 
monkeypatch.setattr(servicesetup_module, "BlobManager", StubBlobManager) + monkeypatch.setattr("prepdocslib.servicesetup.BlobManager", StubBlobManager) result = setup_blob_manager( azure_credential=MockAzureCredential(), @@ -58,9 +66,6 @@ def __init__( def test_setup_embeddings_service_populates_azure_metadata() -> None: - import openai - from openai.types.create_embedding_response import Usage - embeddings = setup_embeddings_service( open_ai_client=MockClient( MockEmbeddingsClient( @@ -79,17 +84,12 @@ def test_setup_embeddings_service_populates_azure_metadata() -> None: azure_openai_endpoint="https://service.openai.azure.com", ) - from prepdocslib.embeddings import OpenAIEmbeddings - assert isinstance(embeddings, OpenAIEmbeddings) assert embeddings.azure_deployment_name == "deployment" assert embeddings.azure_endpoint == "https://service.openai.azure.com" def test_setup_embeddings_service_requires_endpoint_for_azure() -> None: - import openai - from openai.types.create_embedding_response import Usage - with pytest.raises(ValueError): setup_embeddings_service( open_ai_client=MockClient( @@ -111,9 +111,6 @@ def test_setup_embeddings_service_requires_endpoint_for_azure() -> None: def test_setup_embeddings_service_requires_deployment_for_azure() -> None: - import openai - from openai.types.create_embedding_response import Usage - with pytest.raises(ValueError): setup_embeddings_service( open_ai_client=MockClient( @@ -142,10 +139,10 @@ class StubAsyncOpenAI: def __init__(self, *, base_url: str, api_key, **kwargs) -> None: captured_base_url.append(base_url) - import prepdocslib.servicesetup as servicesetup_module - - monkeypatch.setattr(servicesetup_module, "AsyncOpenAI", StubAsyncOpenAI) - monkeypatch.setattr(servicesetup_module, "get_bearer_token_provider", lambda *args, **kwargs: lambda: "fake_token") + monkeypatch.setattr("prepdocslib.servicesetup.AsyncOpenAI", StubAsyncOpenAI) + monkeypatch.setattr( + "prepdocslib.servicesetup.get_bearer_token_provider", lambda 
*args, **kwargs: lambda: "fake_token" + ) client, endpoint = setup_openai_client( openai_host=OpenAIHost.AZURE, @@ -167,9 +164,7 @@ class StubAsyncOpenAI: def __init__(self, *, base_url: str, api_key, **kwargs) -> None: captured_base_url.append(base_url) - import prepdocslib.servicesetup as servicesetup_module - - monkeypatch.setattr(servicesetup_module, "AsyncOpenAI", StubAsyncOpenAI) + monkeypatch.setattr("prepdocslib.servicesetup.AsyncOpenAI", StubAsyncOpenAI) client, endpoint = setup_openai_client( openai_host=OpenAIHost.AZURE_CUSTOM, @@ -192,9 +187,7 @@ class StubAsyncOpenAI: def __init__(self, *, base_url: str, api_key: str, **kwargs) -> None: captured_api_key.append(api_key) - import prepdocslib.servicesetup as servicesetup_module - - monkeypatch.setattr(servicesetup_module, "AsyncOpenAI", StubAsyncOpenAI) + monkeypatch.setattr("prepdocslib.servicesetup.AsyncOpenAI", StubAsyncOpenAI) client, endpoint = setup_openai_client( openai_host=OpenAIHost.AZURE, @@ -238,8 +231,6 @@ def test_setup_openai_client_azure_custom_requires_url() -> None: def test_setup_search_info_agentic_retrieval_without_model(): """Test that setup_search_info raises ValueError when using agentic retrieval without search agent model.""" - from prepdocslib.servicesetup import setup_search_info - with pytest.raises(ValueError, match="SearchAgent model must be specified"): setup_search_info( azure_credential=MockAzureCredential(), @@ -252,8 +243,6 @@ def test_setup_search_info_agentic_retrieval_without_model(): def test_setup_image_embeddings_multimodal_without_vision(): """Test that setup_image_embeddings_service raises ValueError when using multimodal without vision endpoint.""" - from prepdocslib.servicesetup import setup_image_embeddings_service - with pytest.raises(ValueError, match="Azure AI Vision endpoint must be provided"): setup_image_embeddings_service( use_multimodal=True, @@ -264,9 +253,6 @@ def test_setup_image_embeddings_multimodal_without_vision(): def 
test_setup_figure_processor_content_understanding(): """Test that setup_figure_processor returns correct processor for content understanding.""" - from prepdocslib.figureprocessor import FigureProcessor, MediaDescriptionStrategy - from prepdocslib.servicesetup import setup_figure_processor - processor = setup_figure_processor( use_multimodal=False, use_content_understanding=True, @@ -283,9 +269,6 @@ def test_setup_figure_processor_content_understanding(): def test_setup_parser_document_intelligence_with_key(): """Test that select_parser uses key credential when provided.""" - from prepdocslib.pdfparser import DocumentAnalysisParser - from prepdocslib.servicesetup import select_parser - parser = select_parser( file_name="test.pdf", content_type="application/pdf", @@ -300,9 +283,6 @@ def test_setup_parser_document_intelligence_with_key(): def test_setup_parser_text_file(): """Test that select_parser returns TextParser for text files.""" - from prepdocslib.servicesetup import select_parser - from prepdocslib.textparser import TextParser - parser = select_parser( file_name="test.txt", content_type="text/plain", @@ -315,9 +295,6 @@ def test_setup_parser_text_file(): def test_setup_parser_application_type_with_di(): """Test that select_parser uses DI for application/* content types.""" - from prepdocslib.pdfparser import DocumentAnalysisParser - from prepdocslib.servicesetup import select_parser - parser = select_parser( file_name="test.unknown", content_type="application/unknown", @@ -330,8 +307,6 @@ def test_setup_parser_application_type_with_di(): def test_setup_parser_unsupported_file_type(): """Test that select_parser raises ValueError for unsupported file types.""" - from prepdocslib.servicesetup import select_parser - with pytest.raises(ValueError, match="Unsupported file type"): select_parser( file_name="test.xyz", From 9fcaa557ebb794009dd652587b4aaf98a0a723d9 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Tue, 11 Nov 2025 22:45:54 -0800 Subject: [PATCH 22/30] 
More code cleanups --- app/backend/prepdocs.py | 75 ++-------- app/backend/prepdocslib/servicesetup.py | 131 ++++++++++-------- .../document_extractor/function_app.py | 37 ++--- app/functions/text_processor/function_app.py | 27 +++- tests/test_function_apps.py | 106 ++++++++++---- tests/test_servicesetup.py | 65 ++++----- 6 files changed, 242 insertions(+), 199 deletions(-) diff --git a/app/backend/prepdocs.py b/app/backend/prepdocs.py index 5e6502a8db..df23e80542 100644 --- a/app/backend/prepdocs.py +++ b/app/backend/prepdocs.py @@ -11,23 +11,19 @@ from rich.logging import RichHandler from load_azd_env import load_azd_env -from prepdocslib.csvparser import CsvParser -from prepdocslib.fileprocessor import FileProcessor from prepdocslib.filestrategy import FileStrategy from prepdocslib.integratedvectorizerstrategy import ( IntegratedVectorizerStrategy, ) -from prepdocslib.jsonparser import JsonParser from prepdocslib.listfilestrategy import ( ADLSGen2ListFileStrategy, ListFileStrategy, LocalListFileStrategy, ) -from prepdocslib.parser import Parser from prepdocslib.servicesetup import ( OpenAIHost, + build_file_processors, clean_key_if_exists, - select_parser, setup_blob_manager, setup_embeddings_service, setup_figure_processor, @@ -36,8 +32,6 @@ setup_search_info, ) from prepdocslib.strategy import DocumentAction, Strategy -from prepdocslib.textparser import TextParser -from prepdocslib.textsplitter import SentenceTextSplitter, SimpleTextSplitter logger = logging.getLogger("scripts") @@ -100,61 +94,20 @@ def setup_file_processors( openai_deployment: Optional[str] = None, content_understanding_endpoint: Optional[str] = None, ): - sentence_text_splitter = SentenceTextSplitter() + """Setup file processors and figure processor for document ingestion. + + Uses build_file_processors from servicesetup to ensure consistent parser/splitter + selection logic with the Azure Functions cloud ingestion pipeline. 
+ """ + file_processors = build_file_processors( + azure_credential=azure_credential, + document_intelligence_service=document_intelligence_service, + document_intelligence_key=document_intelligence_key, + use_local_pdf_parser=local_pdf_parser, + use_local_html_parser=local_html_parser, + process_figures=use_multimodal, + ) - # Build mapping of file extensions to parsers using shared select_parser helper. - # Each select attempt may instantiate a DI parser; duplication is acceptable at startup. - def _try_select(ext: str, content_type: str) -> Parser | None: - file_name = f"dummy{ext}" - try: - return select_parser( - file_name=file_name, - content_type=content_type, - azure_credential=azure_credential, - document_intelligence_service=document_intelligence_service, - document_intelligence_key=document_intelligence_key, - process_figures=use_multimodal, - use_local_pdf_parser=local_pdf_parser, - use_local_html_parser=local_html_parser, - ) - except ValueError: - return None - - pdf_parser: Parser | None = _try_select(".pdf", "application/pdf") - html_parser: Parser | None = _try_select(".html", "text/html") - - # DI-only formats - di_exts = [ - ".docx", - ".pptx", - ".xlsx", - ".png", - ".jpg", - ".jpeg", - ".tiff", - ".bmp", - ".heic", - ] - di_parsers: dict[str, Parser] = {} - for ext in di_exts: - parser = _try_select(ext, "application/octet-stream") - if parser is not None: - di_parsers[ext] = parser - - # These file formats can always be parsed: - file_processors = { - ".json": FileProcessor(JsonParser(), SimpleTextSplitter()), - ".md": FileProcessor(TextParser(), sentence_text_splitter), - ".txt": FileProcessor(TextParser(), sentence_text_splitter), - ".csv": FileProcessor(CsvParser(), sentence_text_splitter), - } - # These require either a Python package or Document Intelligence - if pdf_parser is not None: - file_processors[".pdf"] = FileProcessor(pdf_parser, sentence_text_splitter) - if html_parser is not None: - file_processors[".html"] = 
FileProcessor(html_parser, sentence_text_splitter) - for ext, parser in di_parsers.items(): - file_processors[ext] = FileProcessor(parser, sentence_text_splitter) figure_processor = setup_figure_processor( credential=azure_credential, use_multimodal=use_multimodal, diff --git a/app/backend/prepdocslib/servicesetup.py b/app/backend/prepdocslib/servicesetup.py index d0f4026e09..fce4ad7d21 100644 --- a/app/backend/prepdocslib/servicesetup.py +++ b/app/backend/prepdocslib/servicesetup.py @@ -12,13 +12,17 @@ from openai import AsyncOpenAI from .blobmanager import BlobManager +from .csvparser import CsvParser from .embeddings import ImageEmbeddings, OpenAIEmbeddings from .figureprocessor import FigureProcessor, MediaDescriptionStrategy +from .fileprocessor import FileProcessor from .htmlparser import LocalHTMLParser +from .jsonparser import JsonParser from .parser import Parser from .pdfparser import DocumentAnalysisParser, LocalPdfParser from .strategy import SearchInfo from .textparser import TextParser +from .textsplitter import SentenceTextSplitter, SimpleTextSplitter logger = logging.getLogger("scripts") @@ -241,77 +245,92 @@ def setup_figure_processor( return None -def select_parser( +def build_file_processors( *, - file_name: str, - content_type: str, azure_credential: AsyncTokenCredential, document_intelligence_service: str | None, document_intelligence_key: str | None = None, - process_figures: bool = False, use_local_pdf_parser: bool = False, use_local_html_parser: bool = False, -) -> Parser: - """Return a parser instance appropriate for the file type and configuration. 
- - Args: - file_name: Source filename (used to derive extension) - content_type: MIME type (fallback for extension-based selection) - azure_credential: Token credential for DI service - document_intelligence_service: Name of DI service (None disables DI) - document_intelligence_key: Optional key credential (overrides token when provided) - process_figures: Whether figure extraction should be enabled in DI parser - use_local_pdf_parser: Force local PDF parsing instead of DI - use_local_html_parser: Force local HTML parsing instead of DI - - Returns: - Parser capable of yielding Page objects for the document. - - Raises: - ValueError: Unsupported file type or missing DI configuration for required formats. - """ - extension = file_name.lower().rsplit(".", 1)[-1] if "." in file_name else "" - ext_with_dot = f".{extension}" if extension else "" + process_figures: bool = False, +) -> dict[str, FileProcessor]: + sentence_text_splitter = SentenceTextSplitter() - # Build DI parser lazily only if needed - di_parser: DocumentAnalysisParser | None = None + doc_int_parser: Optional[DocumentAnalysisParser] = None + # check if Azure Document Intelligence credentials are provided if document_intelligence_service: credential: AsyncTokenCredential | AzureKeyCredential if document_intelligence_key: credential = AzureKeyCredential(document_intelligence_key) else: credential = azure_credential - di_parser = DocumentAnalysisParser( + doc_int_parser = DocumentAnalysisParser( endpoint=f"https://{document_intelligence_service}.cognitiveservices.azure.com/", credential=credential, process_figures=process_figures, ) - # Plain text / structured text formats always local - if ext_with_dot in {".txt", ".md", ".csv", ".json"} or content_type.startswith("text/plain"): - return TextParser() - - # HTML - if ext_with_dot in {".html", ".htm"} or content_type in {"text/html", "application/html"}: - if use_local_html_parser or not di_parser: - return LocalHTMLParser() - return di_parser - - # PDF - if 
ext_with_dot == ".pdf": - if use_local_pdf_parser or not di_parser: - return LocalPdfParser() - return di_parser - - # Formats requiring DI - di_required_exts = {".docx", ".pptx", ".xlsx", ".png", ".jpg", ".jpeg", ".tiff", ".bmp", ".heic"} - if ext_with_dot in di_required_exts: - if not di_parser: - raise ValueError("Document Intelligence service must be configured to process this file type") - return di_parser - - # Fallback: if MIME suggests application/* and DI available, use DI - if content_type.startswith("application/") and di_parser: - return di_parser - - raise ValueError(f"Unsupported file type: {file_name}") + pdf_parser: Optional[Parser] = None + if use_local_pdf_parser or document_intelligence_service is None: + pdf_parser = LocalPdfParser() + elif document_intelligence_service is not None: + pdf_parser = doc_int_parser + else: + logger.warning("No PDF parser available") + + html_parser: Optional[Parser] = None + if use_local_html_parser or document_intelligence_service is None: + html_parser = LocalHTMLParser() + elif document_intelligence_service is not None: + html_parser = doc_int_parser + else: + logger.warning("No HTML parser available") + + # These file formats can always be parsed: + file_processors = { + ".json": FileProcessor(JsonParser(), SimpleTextSplitter()), + ".md": FileProcessor(TextParser(), sentence_text_splitter), + ".txt": FileProcessor(TextParser(), sentence_text_splitter), + ".csv": FileProcessor(CsvParser(), sentence_text_splitter), + } + # These require either a Python package or Document Intelligence + if pdf_parser is not None: + file_processors.update({".pdf": FileProcessor(pdf_parser, sentence_text_splitter)}) + if html_parser is not None: + file_processors.update({".html": FileProcessor(html_parser, sentence_text_splitter)}) + # These file formats require Document Intelligence + if doc_int_parser is not None: + file_processors.update( + { + ".docx": FileProcessor(doc_int_parser, sentence_text_splitter), + ".pptx": 
FileProcessor(doc_int_parser, sentence_text_splitter), + ".xlsx": FileProcessor(doc_int_parser, sentence_text_splitter), + ".png": FileProcessor(doc_int_parser, sentence_text_splitter), + ".jpg": FileProcessor(doc_int_parser, sentence_text_splitter), + ".jpeg": FileProcessor(doc_int_parser, sentence_text_splitter), + ".tiff": FileProcessor(doc_int_parser, sentence_text_splitter), + ".bmp": FileProcessor(doc_int_parser, sentence_text_splitter), + ".heic": FileProcessor(doc_int_parser, sentence_text_splitter), + } + ) + return file_processors + + +def select_processor_for_filename(file_name: str, file_processors: dict[str, FileProcessor]) -> FileProcessor: + """Select the appropriate file processor for a given filename. + + Args: + file_name: Name of the file to process + file_processors: Dictionary mapping file extensions to FileProcessor instances + + Returns: + FileProcessor instance for the file + + Raises: + ValueError: If the file extension is not supported + """ + file_ext = os.path.splitext(file_name)[1].lower() + file_processor = file_processors.get(file_ext) + if not file_processor: + raise ValueError(f"Unsupported file type: {file_name}") + return file_processor diff --git a/app/functions/document_extractor/function_app.py b/app/functions/document_extractor/function_app.py index 0f2758fd21..83e0bd3f85 100644 --- a/app/functions/document_extractor/function_app.py +++ b/app/functions/document_extractor/function_app.py @@ -15,8 +15,12 @@ from azure.core.exceptions import HttpResponseError from azure.identity.aio import ManagedIdentityCredential +from prepdocslib.fileprocessor import FileProcessor from prepdocslib.page import Page -from prepdocslib.servicesetup import select_parser +from prepdocslib.servicesetup import ( + build_file_processors, + select_processor_for_filename, +) app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS) @@ -25,10 +29,7 @@ @dataclass class GlobalSettings: - use_local_pdf_parser: bool - use_local_html_parser: bool - 
use_multimodal: bool - document_intelligence_service: str | None + file_processors: dict[str, FileProcessor] azure_credential: ManagedIdentityCredential @@ -52,11 +53,18 @@ def configure_global_settings(): logger.info("Using default Managed Identity without client ID") azure_credential = ManagedIdentityCredential() - settings = GlobalSettings( + # Build file processors dict for parser selection + file_processors = build_file_processors( + azure_credential=azure_credential, + document_intelligence_service=document_intelligence_service, + document_intelligence_key=None, use_local_pdf_parser=use_local_pdf_parser, use_local_html_parser=use_local_html_parser, - use_multimodal=use_multimodal, - document_intelligence_service=document_intelligence_service, + process_figures=use_multimodal, + ) + + settings = GlobalSettings( + file_processors=file_processors, azure_credential=azure_credential, ) @@ -176,16 +184,9 @@ async def process_document(data: dict[str, Any]) -> dict[str, Any]: document_stream, file_name, content_type = get_document_stream_filedata(data) logger.info("Processing document: %s", file_name) - parser = select_parser( - file_name=file_name, - content_type=content_type, - azure_credential=settings.azure_credential, - document_intelligence_service=settings.document_intelligence_service, - document_intelligence_key=None, - process_figures=settings.use_multimodal, - use_local_pdf_parser=settings.use_local_pdf_parser, - use_local_html_parser=settings.use_local_html_parser, - ) + # Get parser from file_processors dict based on file extension + file_processor = select_processor_for_filename(file_name, settings.file_processors) + parser = file_processor.parser pages: list[Page] = [] try: diff --git a/app/functions/text_processor/function_app.py b/app/functions/text_processor/function_app.py index 3a842f730b..0a3e15cd9f 100644 --- a/app/functions/text_processor/function_app.py +++ b/app/functions/text_processor/function_app.py @@ -14,15 +14,17 @@ from 
prepdocslib.blobmanager import BlobManager from prepdocslib.embeddings import OpenAIEmbeddings +from prepdocslib.fileprocessor import FileProcessor from prepdocslib.listfilestrategy import File from prepdocslib.page import ImageOnPage, Page from prepdocslib.servicesetup import ( OpenAIHost, + build_file_processors, + select_processor_for_filename, setup_embeddings_service, setup_openai_client, ) from prepdocslib.textprocessor import process_text -from prepdocslib.textsplitter import SentenceTextSplitter # Mark the function as anonymous since we are protecting it with built-in auth instead app = func.FunctionApp(http_auth_level=func.AuthLevel.ANONYMOUS) @@ -35,7 +37,7 @@ class GlobalSettings: use_vectors: bool use_multimodal: bool embedding_dimensions: int - sentence_splitter: SentenceTextSplitter + file_processors: dict[str, FileProcessor] embedding_service: OpenAIEmbeddings | None @@ -56,8 +58,7 @@ def configure_global_settings(): azure_openai_custom_url = os.getenv("AZURE_OPENAI_CUSTOM_URL") azure_openai_emb_deployment = os.getenv("AZURE_OPENAI_EMB_DEPLOYMENT") azure_openai_emb_model_name = os.getenv("AZURE_OPENAI_EMB_MODEL_NAME", "text-embedding-3-large") - - sentence_splitter = SentenceTextSplitter() + document_intelligence_service = os.getenv("AZURE_DOCUMENTINTELLIGENCE_SERVICE") # Single shared managed identity credential if AZURE_CLIENT_ID := os.getenv("AZURE_CLIENT_ID"): @@ -67,6 +68,16 @@ def configure_global_settings(): logger.info("Using default Managed Identity without client ID") azure_credential = ManagedIdentityCredential() + # Build file processors to get correct splitter for each file type + file_processors = build_file_processors( + azure_credential=azure_credential, + document_intelligence_service=document_intelligence_service, + document_intelligence_key=None, + use_local_pdf_parser=False, + use_local_html_parser=False, + process_figures=use_multimodal, + ) + # Embedding service (optional) embedding_service = None if use_vectors: @@ -95,7 +106,7 
@@ def configure_global_settings(): use_vectors=use_vectors, use_multimodal=use_multimodal, embedding_dimensions=embedding_dimensions, - sentence_splitter=sentence_splitter, + file_processors=file_processors, embedding_service=embedding_service, ) @@ -215,7 +226,11 @@ async def process_document(data: dict[str, Any]) -> list[dict[str, Any]]: dummy_stream.name = file_name file_wrapper = File(content=dummy_stream) - sections = process_text(pages, file_wrapper, settings.sentence_splitter, category=None) + # Get the appropriate splitter for this file type + file_processor = select_processor_for_filename(file_name, settings.file_processors) + splitter = file_processor.splitter + + sections = process_text(pages, file_wrapper, splitter, category=None) if not sections: return [] diff --git a/tests/test_function_apps.py b/tests/test_function_apps.py index 7db436eef9..3abac2ef9b 100644 --- a/tests/test_function_apps.py +++ b/tests/test_function_apps.py @@ -74,16 +74,18 @@ async def parse(self, content: Any): page_text = f"# Heading\n\n{placeholder}\n\nConclusion." 
page = document_extractor.Page(page_num=0, offset=0, text=page_text, images=[figure]) - # Set up mock settings + # Set up mock file processors and settings + from prepdocslib.fileprocessor import FileProcessor + + mock_file_processors = { + ".pdf": FileProcessor(StubParser([page]), None), + } + mock_settings = document_extractor.GlobalSettings( - use_local_pdf_parser=False, - use_local_html_parser=False, - use_multimodal=False, - document_intelligence_service=None, + file_processors=mock_file_processors, azure_credential=object(), ) monkeypatch.setattr(document_extractor, "settings", mock_settings) - monkeypatch.setattr(document_extractor, "select_parser", lambda **_: StubParser([page])) request_payload = { "values": [ @@ -121,11 +123,10 @@ async def parse(self, content: Any): @pytest.mark.asyncio async def test_document_extractor_requires_single_record(monkeypatch: pytest.MonkeyPatch) -> None: + from prepdocslib.fileprocessor import FileProcessor + mock_settings = document_extractor.GlobalSettings( - use_local_pdf_parser=False, - use_local_html_parser=False, - use_multimodal=False, - document_intelligence_service=None, + file_processors={".pdf": FileProcessor(None, None)}, azure_credential=object(), ) monkeypatch.setattr(document_extractor, "settings", mock_settings) @@ -137,14 +138,13 @@ async def test_document_extractor_requires_single_record(monkeypatch: pytest.Mon @pytest.mark.asyncio async def test_document_extractor_handles_processing_exception(monkeypatch: pytest.MonkeyPatch) -> None: + from prepdocslib.fileprocessor import FileProcessor + async def failing_process(data: dict[str, Any]) -> dict[str, Any]: raise RuntimeError("boom") mock_settings = document_extractor.GlobalSettings( - use_local_pdf_parser=False, - use_local_html_parser=False, - use_multimodal=False, - document_intelligence_service=None, + file_processors={".pdf": FileProcessor(None, None)}, azure_credential=object(), ) monkeypatch.setattr(document_extractor, "settings", mock_settings) @@ 
-179,20 +179,22 @@ async def test_document_extractor_invalid_json_returns_error() -> None: @pytest.mark.asyncio async def test_document_extractor_process_document_http_error(monkeypatch: pytest.MonkeyPatch) -> None: + from prepdocslib.fileprocessor import FileProcessor + class FailingParser: async def parse(self, content): raise document_extractor.HttpResponseError(message="fail") yield # Make this an async generator + mock_file_processors = { + ".pdf": FileProcessor(FailingParser(), None), + } + mock_settings = document_extractor.GlobalSettings( - use_local_pdf_parser=False, - use_local_html_parser=False, - use_multimodal=False, - document_intelligence_service=None, + file_processors=mock_file_processors, azure_credential=object(), ) monkeypatch.setattr(document_extractor, "settings", mock_settings) - monkeypatch.setattr(document_extractor, "select_parser", lambda **_: FailingParser()) data = { "file_data": {"data": base64.b64encode(b"content").decode("utf-8")}, @@ -368,12 +370,20 @@ class StubEmbeddingService: async def create_embeddings(self, texts: list[str]) -> list[list[float]]: return [[0.41, 0.42, 0.43] for _ in texts] + # Set up mock file processors with stub splitter + from prepdocslib.fileprocessor import FileProcessor + from prepdocslib.textparser import TextParser + + mock_file_processors = { + ".pdf": FileProcessor(TextParser(), StubSplitter()), + } + # Set up mock settings mock_settings = text_processor.GlobalSettings( use_vectors=True, use_multimodal=False, embedding_dimensions=3, - sentence_splitter=StubSplitter(), + file_processors=mock_file_processors, embedding_service=StubEmbeddingService(), ) monkeypatch.setattr(text_processor, "settings", mock_settings) @@ -537,7 +547,7 @@ async def test_text_processor_invalid_json(monkeypatch: pytest.MonkeyPatch) -> N use_multimodal=False, embedding_dimensions=1536, embedding_service=None, - sentence_splitter=object(), + file_processors={}, ) monkeypatch.setattr(text_processor, "settings", mock_settings) @@ 
-579,12 +589,19 @@ async def test_text_processor_embeddings_setup(monkeypatch: pytest.MonkeyPatch) @pytest.mark.asyncio async def test_text_processor_no_sections(monkeypatch: pytest.MonkeyPatch) -> None: """Test text processor handles empty sections.""" + from prepdocslib.fileprocessor import FileProcessor + from prepdocslib.textparser import TextParser + from prepdocslib.textsplitter import SentenceTextSplitter + + mock_file_processors = { + ".pdf": FileProcessor(TextParser(), SentenceTextSplitter()), + } mock_settings = text_processor.GlobalSettings( use_vectors=False, use_multimodal=False, embedding_dimensions=1536, embedding_service=None, - sentence_splitter=object(), + file_processors=mock_file_processors, ) monkeypatch.setattr(text_processor, "settings", mock_settings) @@ -625,12 +642,19 @@ async def test_text_processor_embeddings_not_initialized(monkeypatch: pytest.Mon """Test text processor logs warning when embeddings requested but not initialized.""" import logging + from prepdocslib.fileprocessor import FileProcessor + from prepdocslib.textparser import TextParser + from prepdocslib.textsplitter import SentenceTextSplitter + + mock_file_processors = { + ".pdf": FileProcessor(TextParser(), SentenceTextSplitter()), + } mock_settings = text_processor.GlobalSettings( use_vectors=True, # Request embeddings use_multimodal=False, embedding_dimensions=1536, embedding_service=None, # But no service - sentence_splitter=object(), + file_processors=mock_file_processors, ) monkeypatch.setattr(text_processor, "settings", mock_settings) @@ -666,12 +690,19 @@ def mock_process_text(pages, file, splitter, category): @pytest.mark.asyncio async def test_text_processor_empty_chunk_skipped(monkeypatch: pytest.MonkeyPatch) -> None: """Test text processor skips empty chunks.""" + from prepdocslib.fileprocessor import FileProcessor + from prepdocslib.textparser import TextParser + from prepdocslib.textsplitter import SentenceTextSplitter + + mock_file_processors = { + ".pdf": 
FileProcessor(TextParser(), SentenceTextSplitter()), + } mock_settings = text_processor.GlobalSettings( use_vectors=False, use_multimodal=False, embedding_dimensions=1536, embedding_service=None, - sentence_splitter=object(), + file_processors=mock_file_processors, ) monkeypatch.setattr(text_processor, "settings", mock_settings) @@ -713,12 +744,19 @@ def mock_process_text(pages, file, splitter, category): @pytest.mark.asyncio async def test_text_processor_with_multimodal_embeddings(monkeypatch: pytest.MonkeyPatch) -> None: """Test text processor includes image embeddings when use_multimodal is true.""" + from prepdocslib.fileprocessor import FileProcessor + from prepdocslib.textparser import TextParser + from prepdocslib.textsplitter import SentenceTextSplitter + + mock_file_processors = { + ".pdf": FileProcessor(TextParser(), SentenceTextSplitter()), + } mock_settings = text_processor.GlobalSettings( use_vectors=False, use_multimodal=True, embedding_dimensions=1536, embedding_service=None, - sentence_splitter=object(), + file_processors=mock_file_processors, ) monkeypatch.setattr(text_processor, "settings", mock_settings) @@ -774,6 +812,10 @@ async def test_text_processor_embedding_dimension_mismatch(monkeypatch: pytest.M """Test text processor logs warning when embedding dimensions don't match.""" import logging + from prepdocslib.fileprocessor import FileProcessor + from prepdocslib.textparser import TextParser + from prepdocslib.textsplitter import SentenceTextSplitter + mock_embedding_service = type("MockEmbeddingService", (), {})() async def mock_create_embeddings(texts): @@ -781,12 +823,15 @@ async def mock_create_embeddings(texts): mock_embedding_service.create_embeddings = mock_create_embeddings + mock_file_processors = { + ".pdf": FileProcessor(TextParser(), SentenceTextSplitter()), + } mock_settings = text_processor.GlobalSettings( use_vectors=True, use_multimodal=False, embedding_dimensions=1536, # Expecting 1536 dimensions 
embedding_service=mock_embedding_service, - sentence_splitter=object(), + file_processors=mock_file_processors, ) monkeypatch.setattr(text_processor, "settings", mock_settings) @@ -824,6 +869,10 @@ async def test_text_processor_embeddings_missing_warning(monkeypatch: pytest.Mon """Test text processor logs warning when embeddings are requested but missing.""" import logging + from prepdocslib.fileprocessor import FileProcessor + from prepdocslib.textparser import TextParser + from prepdocslib.textsplitter import SentenceTextSplitter + mock_embedding_service = type("MockEmbeddingService", (), {})() async def mock_create_embeddings(texts): @@ -832,12 +881,15 @@ async def mock_create_embeddings(texts): mock_embedding_service.create_embeddings = mock_create_embeddings + mock_file_processors = { + ".pdf": FileProcessor(TextParser(), SentenceTextSplitter()), + } mock_settings = text_processor.GlobalSettings( use_vectors=True, use_multimodal=False, embedding_dimensions=1536, embedding_service=mock_embedding_service, - sentence_splitter=object(), + file_processors=mock_file_processors, ) monkeypatch.setattr(text_processor, "settings", mock_settings) diff --git a/tests/test_servicesetup.py b/tests/test_servicesetup.py index b41b27988f..53b5708c1b 100644 --- a/tests/test_servicesetup.py +++ b/tests/test_servicesetup.py @@ -7,7 +7,7 @@ from prepdocslib.pdfparser import DocumentAnalysisParser from prepdocslib.servicesetup import ( OpenAIHost, - select_parser, + build_file_processors, setup_blob_manager, setup_embeddings_service, setup_figure_processor, @@ -144,7 +144,7 @@ def __init__(self, *, base_url: str, api_key, **kwargs) -> None: "prepdocslib.servicesetup.get_bearer_token_provider", lambda *args, **kwargs: lambda: "fake_token" ) - client, endpoint = setup_openai_client( + _, endpoint = setup_openai_client( openai_host=OpenAIHost.AZURE, azure_credential=MockAzureCredential(), azure_openai_service="myopenaiservice", @@ -166,7 +166,7 @@ def __init__(self, *, base_url: str, 
api_key, **kwargs) -> None: monkeypatch.setattr("prepdocslib.servicesetup.AsyncOpenAI", StubAsyncOpenAI) - client, endpoint = setup_openai_client( + _, endpoint = setup_openai_client( openai_host=OpenAIHost.AZURE_CUSTOM, azure_credential=MockAzureCredential(), azure_openai_custom_url="https://custom.endpoint.com/openai", @@ -189,7 +189,7 @@ def __init__(self, *, base_url: str, api_key: str, **kwargs) -> None: monkeypatch.setattr("prepdocslib.servicesetup.AsyncOpenAI", StubAsyncOpenAI) - client, endpoint = setup_openai_client( + setup_openai_client( openai_host=OpenAIHost.AZURE, azure_credential=MockAzureCredential(), azure_openai_service="myopenaiservice", @@ -267,50 +267,53 @@ def test_setup_figure_processor_content_understanding(): assert processor.strategy == MediaDescriptionStrategy.CONTENTUNDERSTANDING -def test_setup_parser_document_intelligence_with_key(): - """Test that select_parser uses key credential when provided.""" - parser = select_parser( - file_name="test.pdf", - content_type="application/pdf", +def test_build_file_processors_with_document_intelligence_key(): + """Test that build_file_processors uses key credential when provided.""" + file_processors = build_file_processors( azure_credential=MockAzureCredential(), document_intelligence_service="myservice", document_intelligence_key="my-key", + use_local_pdf_parser=False, use_local_html_parser=False, ) - assert isinstance(parser, DocumentAnalysisParser) + assert ".pdf" in file_processors + assert isinstance(file_processors[".pdf"].parser, DocumentAnalysisParser) -def test_setup_parser_text_file(): - """Test that select_parser returns TextParser for text files.""" - parser = select_parser( - file_name="test.txt", - content_type="text/plain", +def test_build_file_processors_text_files(): + """Test that build_file_processors includes text file parsers.""" + file_processors = build_file_processors( azure_credential=MockAzureCredential(), document_intelligence_service=None, ) - assert isinstance(parser, 
TextParser) + assert ".txt" in file_processors + assert isinstance(file_processors[".txt"].parser, TextParser) + assert ".md" in file_processors + assert isinstance(file_processors[".md"].parser, TextParser) -def test_setup_parser_application_type_with_di(): - """Test that select_parser uses DI for application/* content types.""" - parser = select_parser( - file_name="test.unknown", - content_type="application/unknown", +def test_build_file_processors_with_di_enables_office_formats(): + """Test that build_file_processors includes Office formats when DI is available.""" + file_processors = build_file_processors( azure_credential=MockAzureCredential(), document_intelligence_service="myservice", ) - assert isinstance(parser, DocumentAnalysisParser) + assert ".docx" in file_processors + assert ".pptx" in file_processors + assert ".xlsx" in file_processors + assert isinstance(file_processors[".docx"].parser, DocumentAnalysisParser) -def test_setup_parser_unsupported_file_type(): - """Test that select_parser raises ValueError for unsupported file types.""" - with pytest.raises(ValueError, match="Unsupported file type"): - select_parser( - file_name="test.xyz", - content_type="application/xyz", - azure_credential=MockAzureCredential(), - document_intelligence_service=None, - ) +def test_build_file_processors_without_di_excludes_office_formats(): + """Test that build_file_processors excludes Office formats when DI is not available.""" + file_processors = build_file_processors( + azure_credential=MockAzureCredential(), + document_intelligence_service=None, + ) + + assert ".docx" not in file_processors + assert ".pptx" not in file_processors + assert ".xlsx" not in file_processors From 46bbaf767d4c3b5a71e6dbfdc1edae8262f524ad Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Tue, 11 Nov 2025 23:09:09 -0800 Subject: [PATCH 23/30] Cleanup function test --- tests/test_function_apps.py | 170 +++++++++++++++--------------------- 1 file changed, 70 insertions(+), 100 deletions(-) 
diff --git a/tests/test_function_apps.py b/tests/test_function_apps.py index 3abac2ef9b..7b0eda47ef 100644 --- a/tests/test_function_apps.py +++ b/tests/test_function_apps.py @@ -1,6 +1,6 @@ import base64 -import importlib import json +import logging from collections.abc import Iterable from dataclasses import dataclass, field from typing import Any @@ -10,6 +10,9 @@ from document_extractor import function_app as document_extractor from figure_processor import function_app as figure_processor +from prepdocslib.fileprocessor import FileProcessor +from prepdocslib.textparser import TextParser +from prepdocslib.textsplitter import SentenceTextSplitter from tests.mocks import TEST_PNG_BYTES from text_processor import function_app as text_processor @@ -75,8 +78,6 @@ async def parse(self, content: Any): page = document_extractor.Page(page_num=0, offset=0, text=page_text, images=[figure]) # Set up mock file processors and settings - from prepdocslib.fileprocessor import FileProcessor - mock_file_processors = { ".pdf": FileProcessor(StubParser([page]), None), } @@ -123,8 +124,6 @@ async def parse(self, content: Any): @pytest.mark.asyncio async def test_document_extractor_requires_single_record(monkeypatch: pytest.MonkeyPatch) -> None: - from prepdocslib.fileprocessor import FileProcessor - mock_settings = document_extractor.GlobalSettings( file_processors={".pdf": FileProcessor(None, None)}, azure_credential=object(), @@ -138,8 +137,6 @@ async def test_document_extractor_requires_single_record(monkeypatch: pytest.Mon @pytest.mark.asyncio async def test_document_extractor_handles_processing_exception(monkeypatch: pytest.MonkeyPatch) -> None: - from prepdocslib.fileprocessor import FileProcessor - async def failing_process(data: dict[str, Any]) -> dict[str, Any]: raise RuntimeError("boom") @@ -179,8 +176,6 @@ async def test_document_extractor_invalid_json_returns_error() -> None: @pytest.mark.asyncio async def test_document_extractor_process_document_http_error(monkeypatch: 
pytest.MonkeyPatch) -> None: - from prepdocslib.fileprocessor import FileProcessor - class FailingParser: async def parse(self, content): raise document_extractor.HttpResponseError(message="fail") @@ -215,11 +210,10 @@ def test_document_extractor_missing_file_data() -> None: def test_document_extractor_managed_identity_reload(monkeypatch: pytest.MonkeyPatch) -> None: monkeypatch.setenv("AZURE_CLIENT_ID", "client-123") - module = importlib.reload(document_extractor) - module.configure_global_settings() - assert isinstance(module.settings.azure_credential, module.ManagedIdentityCredential) + document_extractor.configure_global_settings() + assert isinstance(document_extractor.settings.azure_credential, document_extractor.ManagedIdentityCredential) monkeypatch.delenv("AZURE_CLIENT_ID", raising=False) - importlib.reload(document_extractor) + document_extractor.configure_global_settings() @pytest.mark.asyncio @@ -297,64 +291,82 @@ def test_figure_processor_initialisation_with_env(monkeypatch: pytest.MonkeyPatc monkeypatch.setenv("AZURE_OPENAI_CHATGPT_DEPLOYMENT", "deploy") monkeypatch.setenv("AZURE_VISION_ENDPOINT", "https://vision") - import sys - from pathlib import Path - - fp_root = Path(__file__).parent.parent / "app" / "functions" / "figure_processor" - sys.path.insert(0, str(fp_root)) - - fp_servicesetup = importlib.import_module("prepdocslib.servicesetup") - fp_embeddings = importlib.import_module("prepdocslib.embeddings") + call_state: dict[str, Any] = {} + + class StubCredential: + def __init__(self, client_id: str | None = None): + call_state["credential_client_id"] = client_id + + def fake_setup_blob_manager(**kwargs: Any) -> str: + call_state["blob_manager_kwargs"] = kwargs + return "blob" + + def fake_setup_figure_processor(**kwargs: Any) -> str: + call_state["figure_processor_kwargs"] = kwargs + return "figproc" + + def fake_setup_openai_client( + *, + openai_host: Any, + azure_credential: Any, + azure_openai_service: str | None, + 
azure_openai_custom_url: str | None, + ) -> tuple[str, None]: + call_state["openai_client_args"] = { + "openai_host": openai_host, + "azure_credential": azure_credential, + "azure_openai_service": azure_openai_service, + "azure_openai_custom_url": azure_openai_custom_url, + } + return ("openai-client", None) - monkeypatch.setattr(fp_servicesetup, "setup_blob_manager", lambda **_: "blob") - monkeypatch.setattr(fp_servicesetup, "setup_figure_processor", lambda **_: "figproc") - monkeypatch.setattr(fp_servicesetup, "setup_openai_client", lambda **_: ("openai-client", None)) + def fake_get_bearer_token_provider(credential: Any, scope: str): + call_state["token_scope"] = scope + call_state["token_credential"] = credential + return lambda: "token" class DummyImageEmbeddings: def __init__(self, endpoint: str, token_provider): self.endpoint = endpoint self.token_provider = token_provider - monkeypatch.setattr(fp_embeddings, "ImageEmbeddings", DummyImageEmbeddings) - monkeypatch.setattr("azure.identity.aio.get_bearer_token_provider", lambda *_, **__: lambda: "token") + monkeypatch.setattr(figure_processor, "ManagedIdentityCredential", StubCredential) + monkeypatch.setattr(figure_processor, "setup_blob_manager", fake_setup_blob_manager) + monkeypatch.setattr(figure_processor, "setup_figure_processor", fake_setup_figure_processor) + monkeypatch.setattr(figure_processor, "setup_openai_client", fake_setup_openai_client) + monkeypatch.setattr(figure_processor, "get_bearer_token_provider", fake_get_bearer_token_provider) + monkeypatch.setattr(figure_processor, "ImageEmbeddings", DummyImageEmbeddings) + monkeypatch.setattr(figure_processor, "settings", None) - module = importlib.reload(figure_processor) - module.configure_global_settings() + figure_processor.configure_global_settings() - assert module.settings.blob_manager == "blob" - assert module.settings.figure_processor == "figproc" - assert isinstance(module.settings.image_embeddings, DummyImageEmbeddings) + assert 
figure_processor.settings is not None + assert figure_processor.settings.blob_manager == "blob" + assert figure_processor.settings.figure_processor == "figproc" + embeddings = figure_processor.settings.image_embeddings + assert isinstance(embeddings, DummyImageEmbeddings) + assert embeddings.endpoint == "https://vision" + assert embeddings.token_provider() == "token" - # Reset module to default configuration for subsequent tests - for var in [ - "AZURE_CLIENT_ID", - "AZURE_STORAGE_ACCOUNT", - "AZURE_IMAGESTORAGE_CONTAINER", - "USE_MULTIMODAL", - "AZURE_OPENAI_SERVICE", - "AZURE_OPENAI_CHATGPT_DEPLOYMENT", - "AZURE_VISION_ENDPOINT", - ]: - monkeypatch.delenv(var, raising=False) - sys.path.remove(str(fp_root)) - importlib.reload(figure_processor) + assert call_state["credential_client_id"] == "client-456" + assert call_state["blob_manager_kwargs"]["storage_account"] == "acct" + assert call_state["figure_processor_kwargs"]["use_multimodal"] is True + assert call_state["token_scope"] == "https://cognitiveservices.azure.com/.default" + assert isinstance(call_state["token_credential"], StubCredential) + assert call_state["openai_client_args"]["azure_openai_service"] == "svc" + assert call_state["openai_client_args"]["azure_credential"] is call_state["token_credential"] -def test_figure_processor_warns_when_openai_incomplete(monkeypatch: pytest.MonkeyPatch) -> None: +def test_figure_processor_warns_when_openai_incomplete(monkeypatch: pytest.MonkeyPatch, caplog) -> None: """Figure processor is created with warning when USE_MULTIMODAL is true but OpenAI config is incomplete.""" monkeypatch.setenv("USE_MULTIMODAL", "true") monkeypatch.setenv("AZURE_STORAGE_ACCOUNT", "acct") monkeypatch.setenv("AZURE_IMAGESTORAGE_CONTAINER", "images") # OpenAI config missing, so figure_processor will be created but won't work properly - module = importlib.reload(figure_processor) - module.configure_global_settings() + figure_processor.configure_global_settings() # A FigureProcessor object is 
created even with incomplete config - assert module.settings.figure_processor is not None - # But it will raise ValueError when trying to describe images due to missing OpenAI client - monkeypatch.delenv("USE_MULTIMODAL", raising=False) - monkeypatch.delenv("AZURE_STORAGE_ACCOUNT", raising=False) - monkeypatch.delenv("AZURE_IMAGESTORAGE_CONTAINER", raising=False) - importlib.reload(figure_processor) + assert figure_processor.settings.figure_processor is not None + assert "USE_MULTIMODAL is true but Azure OpenAI configuration incomplete" in caplog.text @pytest.mark.asyncio @@ -371,9 +383,6 @@ async def create_embeddings(self, texts: list[str]) -> list[list[float]]: return [[0.41, 0.42, 0.43] for _ in texts] # Set up mock file processors with stub splitter - from prepdocslib.fileprocessor import FileProcessor - from prepdocslib.textparser import TextParser - mock_file_processors = { ".pdf": FileProcessor(TextParser(), StubSplitter()), } @@ -562,21 +571,12 @@ async def test_text_processor_invalid_json(monkeypatch: pytest.MonkeyPatch) -> N @pytest.mark.asyncio async def test_text_processor_with_client_id(monkeypatch: pytest.MonkeyPatch) -> None: """Test text processor uses ManagedIdentityCredential with client ID.""" - import os - # Set the AZURE_CLIENT_ID environment variable - original_client_id = os.environ.get("AZURE_CLIENT_ID") - os.environ["AZURE_CLIENT_ID"] = "test-client-id" - - try: - # Force reimport to trigger module initialization with the env var set - importlib.reload(text_processor) - finally: - # Restore original value - if original_client_id: - os.environ["AZURE_CLIENT_ID"] = original_client_id - else: - os.environ.pop("AZURE_CLIENT_ID", None) + monkeypatch.setenv("AZURE_CLIENT_ID", "test-client-id") + text_processor.configure_global_settings() + # Verify it was configured (actual verification would check the credential type) + monkeypatch.delenv("AZURE_CLIENT_ID", raising=False) + text_processor.configure_global_settings() @pytest.mark.asyncio @@ 
-589,10 +589,6 @@ async def test_text_processor_embeddings_setup(monkeypatch: pytest.MonkeyPatch) @pytest.mark.asyncio async def test_text_processor_no_sections(monkeypatch: pytest.MonkeyPatch) -> None: """Test text processor handles empty sections.""" - from prepdocslib.fileprocessor import FileProcessor - from prepdocslib.textparser import TextParser - from prepdocslib.textsplitter import SentenceTextSplitter - mock_file_processors = { ".pdf": FileProcessor(TextParser(), SentenceTextSplitter()), } @@ -640,12 +636,6 @@ def mock_process_text(pages, file, splitter, category): @pytest.mark.asyncio async def test_text_processor_embeddings_not_initialized(monkeypatch: pytest.MonkeyPatch, caplog) -> None: """Test text processor logs warning when embeddings requested but not initialized.""" - import logging - - from prepdocslib.fileprocessor import FileProcessor - from prepdocslib.textparser import TextParser - from prepdocslib.textsplitter import SentenceTextSplitter - mock_file_processors = { ".pdf": FileProcessor(TextParser(), SentenceTextSplitter()), } @@ -690,10 +680,6 @@ def mock_process_text(pages, file, splitter, category): @pytest.mark.asyncio async def test_text_processor_empty_chunk_skipped(monkeypatch: pytest.MonkeyPatch) -> None: """Test text processor skips empty chunks.""" - from prepdocslib.fileprocessor import FileProcessor - from prepdocslib.textparser import TextParser - from prepdocslib.textsplitter import SentenceTextSplitter - mock_file_processors = { ".pdf": FileProcessor(TextParser(), SentenceTextSplitter()), } @@ -744,10 +730,6 @@ def mock_process_text(pages, file, splitter, category): @pytest.mark.asyncio async def test_text_processor_with_multimodal_embeddings(monkeypatch: pytest.MonkeyPatch) -> None: """Test text processor includes image embeddings when use_multimodal is true.""" - from prepdocslib.fileprocessor import FileProcessor - from prepdocslib.textparser import TextParser - from prepdocslib.textsplitter import SentenceTextSplitter - 
mock_file_processors = { ".pdf": FileProcessor(TextParser(), SentenceTextSplitter()), } @@ -810,12 +792,6 @@ def mock_process_text(pages, file, splitter, category): @pytest.mark.asyncio async def test_text_processor_embedding_dimension_mismatch(monkeypatch: pytest.MonkeyPatch, caplog) -> None: """Test text processor logs warning when embedding dimensions don't match.""" - import logging - - from prepdocslib.fileprocessor import FileProcessor - from prepdocslib.textparser import TextParser - from prepdocslib.textsplitter import SentenceTextSplitter - mock_embedding_service = type("MockEmbeddingService", (), {})() async def mock_create_embeddings(texts): @@ -867,12 +843,6 @@ def mock_process_text(pages, file, splitter, category): @pytest.mark.asyncio async def test_text_processor_embeddings_missing_warning(monkeypatch: pytest.MonkeyPatch, caplog) -> None: """Test text processor logs warning when embeddings are requested but missing.""" - import logging - - from prepdocslib.fileprocessor import FileProcessor - from prepdocslib.textparser import TextParser - from prepdocslib.textsplitter import SentenceTextSplitter - mock_embedding_service = type("MockEmbeddingService", (), {})() async def mock_create_embeddings(texts): From 2d7b45312ccd8197655fb8becdaa417bccba3923 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Tue, 11 Nov 2025 23:43:09 -0800 Subject: [PATCH 24/30] 100% diff coverage --- app/backend/prepdocslib/filestrategy.py | 2 - app/backend/prepdocslib/servicesetup.py | 4 +- tests/test_function_apps.py | 192 ++++++++++++++++++++++-- tests/test_servicesetup.py | 34 +++++ 4 files changed, 219 insertions(+), 13 deletions(-) diff --git a/app/backend/prepdocslib/filestrategy.py b/app/backend/prepdocslib/filestrategy.py index 3eb8594f99..66022d9178 100644 --- a/app/backend/prepdocslib/filestrategy.py +++ b/app/backend/prepdocslib/filestrategy.py @@ -35,8 +35,6 @@ async def parse_file( return [] logger.info("Ingesting '%s'", file.filename()) pages = [page async for page 
in processor.parser.parse(content=file.content)] - total_images = sum(len(page.images) for page in pages) - logger.info("Found %d images across %d pages", total_images, len(pages)) for page in pages: for image in page.images: logger.info("Processing image '%s' on page %d", image.filename, page.page_num) diff --git a/app/backend/prepdocslib/servicesetup.py b/app/backend/prepdocslib/servicesetup.py index fce4ad7d21..7b3bd45c65 100644 --- a/app/backend/prepdocslib/servicesetup.py +++ b/app/backend/prepdocslib/servicesetup.py @@ -273,7 +273,7 @@ def build_file_processors( pdf_parser: Optional[Parser] = None if use_local_pdf_parser or document_intelligence_service is None: pdf_parser = LocalPdfParser() - elif document_intelligence_service is not None: + elif doc_int_parser is not None: pdf_parser = doc_int_parser else: logger.warning("No PDF parser available") @@ -281,7 +281,7 @@ def build_file_processors( html_parser: Optional[Parser] = None if use_local_html_parser or document_intelligence_service is None: html_parser = LocalHTMLParser() - elif document_intelligence_service is not None: + elif doc_int_parser is not None: html_parser = doc_int_parser else: logger.warning("No HTML parser available") diff --git a/tests/test_function_apps.py b/tests/test_function_apps.py index 7b0eda47ef..f2183acb8d 100644 --- a/tests/test_function_apps.py +++ b/tests/test_function_apps.py @@ -1,6 +1,7 @@ import base64 import json import logging +import os from collections.abc import Iterable from dataclasses import dataclass, field from typing import Any @@ -485,12 +486,32 @@ async def test_document_extractor_without_settings(monkeypatch: pytest.MonkeyPat assert body["error"] == "Settings not initialized" -@pytest.mark.asyncio -async def test_document_extractor_module_init_key_error(monkeypatch: pytest.MonkeyPatch) -> None: - """Test document extractor handles KeyError during module initialization.""" - # This tests lines 248-249 in document_extractor/function_app.py - # The module-level 
initialization code catches KeyError and logs a warning - pass # This is tested by ensuring the module can load even if env vars are missing +def test_document_extractor_module_init_key_error( + monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture +) -> None: + """Reload module without pytest env to trigger init warning path.""" + import importlib + from unittest import mock + + saved_env = os.environ.get("PYTEST_CURRENT_TEST") + monkeypatch.delenv("PYTEST_CURRENT_TEST", raising=False) + + caplog.set_level("WARNING") + + with mock.patch("azure.identity.aio.ManagedIdentityCredential", lambda *_, **__: object()), mock.patch( + "prepdocslib.servicesetup.build_file_processors", side_effect=KeyError("missing env") + ): + reloaded = importlib.reload(document_extractor) + + assert "Could not initialize settings at module load time" in caplog.text + + monkeypatch.setenv("PYTEST_CURRENT_TEST", "pytest") + + if saved_env is not None: + monkeypatch.setenv("PYTEST_CURRENT_TEST", saved_env) + + importlib.reload(reloaded) + reloaded.settings = None @pytest.mark.asyncio @@ -581,9 +602,62 @@ async def test_text_processor_with_client_id(monkeypatch: pytest.MonkeyPatch) -> @pytest.mark.asyncio async def test_text_processor_embeddings_setup(monkeypatch: pytest.MonkeyPatch) -> None: - """Test text processor sets up embeddings when use_vectors is true.""" - # This tests lines 75-76, 82 in text_processor/function_app.py - pass # This is tested by the existing comprehensive text processor tests + """configure_global_settings wires up embedding service when configuration is complete.""" + + monkeypatch.setenv("USE_VECTORS", "true") + monkeypatch.setenv("AZURE_OPENAI_SERVICE", "svc") + monkeypatch.setenv("AZURE_OPENAI_EMB_DEPLOYMENT", "deployment") + monkeypatch.setenv("AZURE_OPENAI_EMB_MODEL_NAME", "model") + monkeypatch.setenv("OPENAI_HOST", "azure") + + class StubCredential: + def __init__(self, *args, **kwargs) -> None: + pass + + monkeypatch.setattr(text_processor, 
"ManagedIdentityCredential", StubCredential) + monkeypatch.setattr(text_processor, "build_file_processors", lambda **kwargs: {".pdf": object()}) + + calls: dict[str, object] = {} + + def fake_setup_openai_client(**kwargs): + calls["openai_host"] = kwargs["openai_host"] + return object(), "https://svc.openai.azure.com" + + def fake_setup_embeddings_service(openai_host, openai_client, **kwargs): + calls["embedding"] = kwargs + return "embedding-service" + + monkeypatch.setattr(text_processor, "setup_openai_client", fake_setup_openai_client) + monkeypatch.setattr(text_processor, "setup_embeddings_service", fake_setup_embeddings_service) + + text_processor.settings = None + text_processor.configure_global_settings() + + assert calls["openai_host"] == text_processor.OpenAIHost.AZURE + assert calls["embedding"]["emb_model_name"] == "model" + assert text_processor.settings is not None + assert text_processor.settings.embedding_service == "embedding-service" + + text_processor.settings = None + + +def test_text_processor_configure_logs_when_embedding_config_missing( + monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture +) -> None: + monkeypatch.setenv("USE_VECTORS", "true") + monkeypatch.setattr(text_processor, "ManagedIdentityCredential", lambda *args, **kwargs: object()) + monkeypatch.setattr(text_processor, "build_file_processors", lambda **kwargs: {".pdf": object()}) + + text_processor.settings = None + + with caplog.at_level(logging.WARNING): + text_processor.configure_global_settings() + + assert "embedding configuration incomplete" in caplog.text + assert text_processor.settings is not None + assert text_processor.settings.embedding_service is None + + text_processor.settings = None @pytest.mark.asyncio @@ -891,3 +965,103 @@ def mock_process_text(pages, file, splitter, category): assert response.status_code == 200 assert "were requested but missing" in caplog.text + + +@pytest.mark.asyncio +async def 
test_text_processor_process_document_handles_missing_figures( + monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture +) -> None: + stub_processor = FileProcessor(TextParser(), SentenceTextSplitter()) + + monkeypatch.setattr(text_processor, "select_processor_for_filename", lambda *_args, **_kwargs: stub_processor) + monkeypatch.setattr( + text_processor, + "process_text", + lambda *args, **kwargs: [SectionStub(chunk=ChunkStub(page_num=0, text="Chunk", images=[]))], + ) + + text_processor.settings = text_processor.GlobalSettings( + use_vectors=False, + use_multimodal=False, + embedding_dimensions=1536, + file_processors={".pdf": stub_processor}, + embedding_service=None, + ) + + payload = { + "consolidated_document": { + "file_name": "sample.pdf", + "pages": [ + { + "page_num": 0, + "text": "Hello", + "figure_ids": ["missing", "bad"], + } + ], + "figures": [ + { + "figure_id": "bad", + # Missing filename forces ImageOnPage.from_skill_payload to raise AssertionError + } + ], + } + } + + with caplog.at_level(logging.WARNING): + chunks = await text_processor.process_document(payload) + + assert chunks + assert any("not found in figures metadata" in record.message for record in caplog.records) + assert any("Failed to deserialize figure" in record.message for record in caplog.records) + + text_processor.settings = None + + +@pytest.mark.asyncio +async def test_text_processor_process_document_returns_empty_when_no_pages(monkeypatch: pytest.MonkeyPatch) -> None: + text_processor.settings = text_processor.GlobalSettings( + use_vectors=False, + use_multimodal=False, + embedding_dimensions=1536, + file_processors={}, + embedding_service=None, + ) + + result = await text_processor.process_document({"consolidated_document": {"file_name": "empty.pdf", "pages": []}}) + + assert result == [] + + text_processor.settings = None + + +def test_text_processor_module_init_logs_warning( + monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture +) -> None: + import 
importlib + from unittest import mock + + saved_env = os.environ.get("PYTEST_CURRENT_TEST") + monkeypatch.delenv("PYTEST_CURRENT_TEST", raising=False) + + class StubCredential: + def __init__(self, *args, **kwargs) -> None: + pass + + caplog.set_level("WARNING") + + with mock.patch("azure.identity.aio.ManagedIdentityCredential", StubCredential), mock.patch( + "prepdocslib.servicesetup.build_file_processors", side_effect=KeyError("missing env") + ), mock.patch("prepdocslib.servicesetup.setup_openai_client", return_value=(object(), None)), mock.patch( + "prepdocslib.servicesetup.setup_embeddings_service", return_value=None + ): + reloaded = importlib.reload(text_processor) + + assert "Could not initialize settings at module load time" in caplog.text + + monkeypatch.setenv("PYTEST_CURRENT_TEST", "pytest") + + if saved_env is not None: + monkeypatch.setenv("PYTEST_CURRENT_TEST", saved_env) + + importlib.reload(reloaded) + reloaded.settings = None diff --git a/tests/test_servicesetup.py b/tests/test_servicesetup.py index 53b5708c1b..0334042034 100644 --- a/tests/test_servicesetup.py +++ b/tests/test_servicesetup.py @@ -4,10 +4,13 @@ from prepdocslib.embeddings import OpenAIEmbeddings from prepdocslib.figureprocessor import FigureProcessor, MediaDescriptionStrategy +from prepdocslib.fileprocessor import FileProcessor from prepdocslib.pdfparser import DocumentAnalysisParser from prepdocslib.servicesetup import ( OpenAIHost, build_file_processors, + clean_key_if_exists, + select_processor_for_filename, setup_blob_manager, setup_embeddings_service, setup_figure_processor, @@ -317,3 +320,34 @@ def test_build_file_processors_without_di_excludes_office_formats(): assert ".docx" not in file_processors assert ".pptx" not in file_processors assert ".xlsx" not in file_processors + + +def test_clean_key_if_exists_handles_whitespace() -> None: + assert clean_key_if_exists(" secret ") == "secret" + assert clean_key_if_exists(" ") is None + assert clean_key_if_exists(None) is None + + 
+def test_build_file_processors_logs_when_no_parsers( + monkeypatch: pytest.MonkeyPatch, caplog: pytest.LogCaptureFixture +) -> None: + caplog.set_level("WARNING") + monkeypatch.setattr("prepdocslib.servicesetup.DocumentAnalysisParser", lambda *args, **kwargs: None) + + processors = build_file_processors( + azure_credential=MockAzureCredential(), + document_intelligence_service="service", + use_local_pdf_parser=False, + use_local_html_parser=False, + ) + + assert ".pdf" not in processors + assert ".html" not in processors + warnings = {record.message for record in caplog.records} + assert any("No PDF parser available" in message for message in warnings) + assert any("No HTML parser available" in message for message in warnings) + + +def test_select_processor_for_filename_raises_when_unknown() -> None: + with pytest.raises(ValueError, match="Unsupported file type: file.unsupported"): + select_processor_for_filename("file.unsupported", {".txt": FileProcessor(TextParser(), None)}) From c5116c82727abc85d329154c39433f7320a50d35 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Wed, 12 Nov 2025 08:54:32 -0800 Subject: [PATCH 25/30] Update app/functions/document_extractor/function_app.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- app/functions/document_extractor/function_app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/functions/document_extractor/function_app.py b/app/functions/document_extractor/function_app.py index 83e0bd3f85..92077bc6d4 100644 --- a/app/functions/document_extractor/function_app.py +++ b/app/functions/document_extractor/function_app.py @@ -167,7 +167,7 @@ async def extract_document(req: func.HttpRequest) -> func.HttpResponse: return func.HttpResponse(json.dumps({"values": output_values}), mimetype="application/json", status_code=200) except Exception as e: - logger.error(f"Fatal error in extract_document: {str(e)}", exc_info=True) + logger.error("Fatal error in extract_document: %s", str(e), 
exc_info=True) return func.HttpResponse(json.dumps({"error": str(e)}), mimetype="application/json", status_code=500) From cfa762c2bd55a0866d712b4e3e9fce0fa9d2d3cb Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Wed, 12 Nov 2025 08:54:54 -0800 Subject: [PATCH 26/30] Update app/backend/prepdocslib/page.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- app/backend/prepdocslib/page.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/app/backend/prepdocslib/page.py b/app/backend/prepdocslib/page.py index 41cfb0cc2c..ed59a9bd7f 100644 --- a/app/backend/prepdocslib/page.py +++ b/app/backend/prepdocslib/page.py @@ -64,8 +64,10 @@ def from_skill_payload(cls, data: dict[str, Any]) -> tuple["ImageOnPage", str]: filename = data.get("filename") figure_id = data.get("figure_id") placeholder = data.get("placeholder") - assert filename is not None, "filename is required" - assert figure_id is not None, "figure_id is required" + if filename is None: + raise ValueError("filename is required") + if figure_id is None: + raise ValueError("figure_id is required") # Generate placeholder if not provided if placeholder is None: From 7c25851338416a47b05c24139cf3162da0913a14 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Wed, 12 Nov 2025 08:57:09 -0800 Subject: [PATCH 27/30] Update app/functions/document_extractor/function_app.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- app/functions/document_extractor/function_app.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/app/functions/document_extractor/function_app.py b/app/functions/document_extractor/function_app.py index 92077bc6d4..22f269b2a1 100644 --- a/app/functions/document_extractor/function_app.py +++ b/app/functions/document_extractor/function_app.py @@ -154,7 +154,7 @@ async def extract_document(req: func.HttpRequest) -> func.HttpResponse: } ] except Exception as e: - logger.error(f"Error processing record {record_id}: {str(e)}", 
exc_info=True) + logger.error("Error processing record %s: %s", record_id, str(e), exc_info=True) output_values = [ { "recordId": record_id, From e9f13f57a2c108da278864e038a15448ebe4c8bc Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Wed, 12 Nov 2025 09:23:31 -0800 Subject: [PATCH 28/30] Address feedback and tweak docs --- .github/prompts/testcov.prompt.md | 27 -------------- README.md | 2 +- azure.yaml | 60 +++++++++++++++---------------- docs/data_ingestion.md | 8 ++--- docs/deploy_features.md | 4 +-- tests/test_mediadescriber.py | 2 +- 6 files changed, 38 insertions(+), 65 deletions(-) delete mode 100644 .github/prompts/testcov.prompt.md diff --git a/.github/prompts/testcov.prompt.md b/.github/prompts/testcov.prompt.md deleted file mode 100644 index 76a318deb9..0000000000 --- a/.github/prompts/testcov.prompt.md +++ /dev/null @@ -1,27 +0,0 @@ ---- -agent: agent ---- - -The goal is for the tests to cover all lines of code. - -Generate a coverage report with: - -pytest --cov --cov-report=annotate:cov_annotate - -If you are checking for coverage of a specific module, you can specify it like this: - -pytest --cov=your_module_name --cov-report=annotate:cov_annotate - -You can also specify specific tests to run, for example: - -pytest tests/test_your_module.py --cov=your_module_name --cov-report=annotate:cov_annotate - -Open the cov_annotate directory to view the annotated source code. -There will be one file per source file. If a file has 100% source coverage, it means all lines are covered by tests, so you do not need to open the file. - -For each file that has less than 100% test coverage, find the matching file in cov_annotate and review the file. - -If a line starts with a ! (exclamation mark), it means that the line is not covered by tests. -Add tests to cover the missing lines. - -Keep running the tests and improving coverage until all lines are covered. 
diff --git a/README.md b/README.md index 181573b13b..23934a3cbb 100644 --- a/README.md +++ b/README.md @@ -60,7 +60,7 @@ The repo includes sample data so it's ready to try end to end. In this sample ap - Chat (multi-turn) and Q&A (single turn) interfaces - Renders citations and thought process for each answer - Includes settings directly in the UI to tweak the behavior and experiment with options -- Integrates Azure AI Search for indexing and retrieval of documents, with support for [many document formats](/docs/data_ingestion.md#supported-document-formats) as well as [cloud-based data ingestion](/docs/data_ingestion.md#cloud-based-ingestion) +- Integrates Azure AI Search for indexing and retrieval of documents, with support for [many document formats](/docs/data_ingestion.md#supported-document-formats) as well as [cloud data ingestion](/docs/data_ingestion.md#cloud-data-ingestion) - Optional usage of [multimodal models](/docs/multimodal.md) to reason over image-heavy documents - Optional addition of [speech input/output](/docs/deploy_features.md#enabling-speech-inputoutput) for accessibility - Optional automation of [user login and data access](/docs/login_and_acl.md) via Microsoft Entra diff --git a/azure.yaml b/azure.yaml index afb975de9d..c8771e6d3a 100644 --- a/azure.yaml +++ b/azure.yaml @@ -41,36 +41,36 @@ services: interactive: false continueOnError: false # Un-comment this section if using USE_CLOUD_INGESTION option - # document-extractor: - # project: ./app/functions/document_extractor - # language: py - # host: function - # hooks: - # prepackage: - # shell: pwsh - # run: python ../../../scripts/copy_prepdocslib.py - # interactive: false - # continueOnError: false - # figure-processor: - # project: ./app/functions/figure_processor - # language: py - # host: function - # hooks: - # prepackage: - # shell: pwsh - # run: python ../../../scripts/copy_prepdocslib.py - # interactive: false - # continueOnError: false - # text-processor: - # project: 
./app/functions/text_processor - # language: py - # host: function - # hooks: - # prepackage: - # shell: pwsh - # run: python ../../../scripts/copy_prepdocslib.py - # interactive: false - # continueOnError: false + document-extractor: + project: ./app/functions/document_extractor + language: py + host: function + hooks: + prepackage: + shell: pwsh + run: python ../../../scripts/copy_prepdocslib.py + interactive: false + continueOnError: false + figure-processor: + project: ./app/functions/figure_processor + language: py + host: function + hooks: + prepackage: + shell: pwsh + run: python ../../../scripts/copy_prepdocslib.py + interactive: false + continueOnError: false + text-processor: + project: ./app/functions/text_processor + language: py + host: function + hooks: + prepackage: + shell: pwsh + run: python ../../../scripts/copy_prepdocslib.py + interactive: false + continueOnError: false hooks: preprovision: windows: diff --git a/docs/data_ingestion.md b/docs/data_ingestion.md index 329b82934f..7c74e859ec 100644 --- a/docs/data_ingestion.md +++ b/docs/data_ingestion.md @@ -2,7 +2,7 @@ The [azure-search-openai-demo](/) project can set up a full RAG chat app on Azure AI Search and OpenAI so that you can chat on custom data, like internal enterprise data or domain-specific knowledge sets. For full instructions on setting up the project, consult the [main README](/README.md), and then return here for detailed instructions on the data ingestion component. -The chat app provides two ways to ingest data: manual ingestion and cloud-based ingestion. Both approaches use the same code for processing the data, but the manual ingestion runs locally while cloud ingestion runs in Azure Functions as Azure AI Search custom skills. +The chat app provides two ways to ingest data: manual ingestion and cloud ingestion. 
Both approaches use the same code for processing the data, but the manual ingestion runs locally while cloud ingestion runs in Azure Functions as Azure AI Search custom skills. - [Supported document formats](#supported-document-formats) - [Ingestion stages](#ingestion-stages) @@ -13,7 +13,7 @@ The chat app provides two ways to ingest data: manual ingestion and cloud-based - [Categorizing data for enhanced search](#enhancing-search-functionality-with-data-categorization) - [Indexing additional documents](#indexing-additional-documents) - [Removing documents](#removing-documents) -- [Cloud-based ingestion](#cloud-based-ingestion) +- [Cloud ingestion](#cloud-ingestion) - [Custom skills pipeline](#custom-skills-pipeline) - [Indexing of additional documents](#indexing-of-additional-documents) - [Removal of documents](#removal-of-documents) @@ -36,7 +36,7 @@ In order to ingest a document format, we need a tool that can turn it into text. ## Ingestion stages -The ingestion pipeline consists of three main stages that transform raw documents into searchable content in Azure AI Search. These stages apply to both [local ingestion](#local-ingestion) and [cloud-based ingestion](#cloud-based-ingestion). +The ingestion pipeline consists of three main stages that transform raw documents into searchable content in Azure AI Search. These stages apply to both [local ingestion](#local-ingestion) and [cloud ingestion](#cloud-ingestion). ### Document extraction @@ -132,7 +132,7 @@ To remove all documents, use `./scripts/prepdocs.sh --removeall` or `./scripts/p You can also remove individual documents by using the `--remove` flag. Open either `scripts/prepdocs.sh` or `scripts/prepdocs.ps1` and replace `/data/*` with `/data/YOUR-DOCUMENT-FILENAME-GOES-HERE.pdf`. Then run `scripts/prepdocs.sh --remove` or `scripts/prepdocs.ps1 --remove`. 
-## Cloud-based ingestion +## Cloud ingestion This project includes an optional feature to perform data ingestion in the cloud using Azure Functions as custom skills for Azure AI Search indexers. This approach offloads the ingestion workload from your local machine to the cloud, allowing for more scalable and efficient processing of large datasets. diff --git a/docs/deploy_features.md b/docs/deploy_features.md index 0c63ca9b40..af64f9e6fd 100644 --- a/docs/deploy_features.md +++ b/docs/deploy_features.md @@ -322,9 +322,9 @@ Alternatively you can use the browser's built-in [Speech Synthesis API](https:// azd env set USE_SPEECH_OUTPUT_BROWSER true ``` -## Enabling cloud-based data ingestion +## Enabling cloud data ingestion -By default, this project runs a local script in order to ingest data. Once you move beyond the sample documents, you may want cloud-based ingestion, which uses Azure AI Search indexers and custom Azure AI Search skills based off the same code used by the local ingestion. That approach scales better to larger amounts of data. +By default, this project runs a local script in order to ingest data. Once you move beyond the sample documents, you may want cloud ingestion, which uses Azure AI Search indexers and custom Azure AI Search skills based off the same code used by the local ingestion. That approach scales better to larger amounts of data. 
To enable cloud ingestion: diff --git a/tests/test_mediadescriber.py b/tests/test_mediadescriber.py index 6822e28468..2f767f712e 100644 --- a/tests/test_mediadescriber.py +++ b/tests/test_mediadescriber.py @@ -68,7 +68,7 @@ def mock_get(self, url, **kwargs): "startPageNumber": 1, "endPageNumber": 1, "unit": "pixel", - "pages": [{"pageNumber": 0}], + "pages": [{"pageNumber": 1}], } ], }, From b96f9c1f1caa34090c50b4699ff76f3d17d89e76 Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Wed, 12 Nov 2025 10:00:17 -0800 Subject: [PATCH 29/30] Apply suggestions from code review Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- app/backend/prepdocslib/blobmanager.py | 3 ++- app/backend/prepdocslib/page.py | 2 +- tests/test_searchmanager.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/app/backend/prepdocslib/blobmanager.py b/app/backend/prepdocslib/blobmanager.py index f682ec5029..e02695b8a8 100644 --- a/app/backend/prepdocslib/blobmanager.py +++ b/app/backend/prepdocslib/blobmanager.py @@ -432,7 +432,8 @@ async def upload_blob(self, file: File) -> str: blob_client = await container_client.upload_blob(blob_name, reopened_file, overwrite=True) file.url = blob_client.url - assert file.url is not None, "file.url must be set after upload" + if file.url is None: + raise ValueError("file.url must be set after upload") return unquote(file.url) async def upload_document_image( diff --git a/app/backend/prepdocslib/page.py b/app/backend/prepdocslib/page.py index ed59a9bd7f..cece557cc7 100644 --- a/app/backend/prepdocslib/page.py +++ b/app/backend/prepdocslib/page.py @@ -67,7 +67,7 @@ def from_skill_payload(cls, data: dict[str, Any]) -> tuple["ImageOnPage", str]: if filename is None: raise ValueError("filename is required") if figure_id is None: - raise ValueError("figure_id is required") + raise ValueError("figure_id is required for ImageOnPage deserialization") # Generate placeholder if not provided if placeholder is None: diff --git 
a/tests/test_searchmanager.py b/tests/test_searchmanager.py index a730f469f7..47a54d4905 100644 --- a/tests/test_searchmanager.py +++ b/tests/test_searchmanager.py @@ -694,7 +694,7 @@ async def mock_upload_documents(self, documents): description="Test image", figure_id="fig1", page_num=0, - placeholder="
", # required positional arg + placeholder="
", url="http://example.com/img1.png", embedding=[0.01, 0.02], ) From 021125015b73dc07f6f2cf7f3260aff50ff09f8c Mon Sep 17 00:00:00 2001 From: Pamela Fox Date: Wed, 12 Nov 2025 10:31:03 -0800 Subject: [PATCH 30/30] Adding diagram --- docs/data_ingestion.md | 2 ++ docs/images/ingestion_pipeline.png | Bin 0 -> 87788 bytes 2 files changed, 2 insertions(+) create mode 100644 docs/images/ingestion_pipeline.png diff --git a/docs/data_ingestion.md b/docs/data_ingestion.md index 7c74e859ec..92f7b21b43 100644 --- a/docs/data_ingestion.md +++ b/docs/data_ingestion.md @@ -144,6 +144,8 @@ This feature cannot be used on existing index. You need to create a new index or The cloud ingestion pipeline uses four Azure Functions as custom skills within an Azure AI Search indexer. Each function corresponds to a stage in the ingestion process. Here's how it works: +![Cloud ingestion pipeline diagram](images/ingestion_pipeline.png) + 1. **User uploads documents** to Azure Blob Storage (content container) 2. **Azure AI Search Indexer** monitors the blob container and orchestrates processing 3. 
**Custom skills** process documents through three stages: diff --git a/docs/images/ingestion_pipeline.png b/docs/images/ingestion_pipeline.png new file mode 100644 index 0000000000000000000000000000000000000000..1033da6d831c3d274f01714a3ffc3f48cb959389 GIT binary patch literal 87788 zcmdqJby!#H)-DVPQW6rPlG5GM-QC^Y-Hmifcc*lBqjX6~w{&;+H*xKC-u*5(d!N5Q zxxASC=2N4`xW@xQQWAo2Fdtz+KtSMxh4^GZK)}U8K)`UJUIU-7?=!@JfWVX+^YTgw z^YY?I*;pDFo9Tmq2nEH)Ln_J6AblBbVNCUnLkx}?NybS9k>Np5XPJf){vyQ#1&)Z0 z8$wo|l^^&8P7PJsisDN#D(t&={QT0R5Xy4n_7ym!q=#VGKNc=qE?XI|B{qiVndxkY z9Ea`+L8Oc@B8gGsAaTSK2;k4H4f%RHX^%R-fCqvg$bq&m8w4wT`UDCR`{Ca4+6s*; z`lfb9uh`S`L6hIBXLt<^#4iJ5cwj9z(+vZ&tO3{h9Z0XncxiGpLDCu-@mQyk7xAUI z%?f?En9UuEF10MjawZmW5eOHKEz=4gNUuwNYUEFGzSRhJ=~~1SaGsdarSAk36sZBU zNoemgIdEf$R_RvTLZYwg5{`c*U`S-el=6ogy?n9tG=j$BcJx#)ly>=+pG`$f zldoXmcB8ZvI5Z?7KX@!Zr2lwl^K+B7n7r5wG@-d_OmId0a9rJY%U3xWD)jJ6nsbiD zBptVNpyf88{EnkUez<&MmK_wB>3e3W& z%LC`Vuh08(a4T*kp;tN)uN1y~&VY*f5~@X(57xwm$pixLlbr#P0=D@>-wJacoc;Hw zElA$q1n)tyzcYbzI|YJCf07r1{S_egi7XJhflEUQ1O=vx%UTK--xq%b#)v`=ES6V( z1S=ow27(1THK3GdVZ_}CWk`88tnHfF9 z+hiY#b_K0VZAmREZP#+%X#^&WurRvLnN>GyntHu?8#NLO$WmxW_^D9z&d`-(C;4WQ z#ShqDH-A?h@n7v<@m^J3sa}1$6ZC$=9iqj5MA(Vs>kGyMv5d3)X8GOn2kjJM$#EhU zB%dI*6IXtPCiV07(^ryZ-j?(tYm17;R>S0g&O&G@LHlHBS#~xBb0IQBKqPB zVwiDl`3<->kd-6NcrI|La7BC3dvE(TdoTMAiBW<pS*4sXN&plRk2M#0}m5h(@nr{Oj`qee~x#qqI?s$t;rwBk1Z{qtpqP zA5uEy?+d<)bn|xy);89b)$+N9JH%hqhh<9mN_a^`58|_6$;`K0`{F@nnXMZ?5c$%N%WCBjj{HGOch)8j*OT8$mmu9Oa?9cs(-vkjMny$0k}W>v4NKy4eHzc_n2 zhg|PEyPWE5nkP-s947mf@E3MzgapI|$9cp-#F@ugmWUPCD@-Uj7L!aaP8OA@mMl#t z%pRFL%%084n7=ZgnyH=hsHv($p8GrtH@7>}Gcz^4r28x3)1b@-DrB)wQ-?`_EPOh~ zK1LNrILpJj-bT!d-N^Q}TgUa?wd*DV%Px~`$|cif6SdlknnTlBlSz|}nyGqv{dVK= z{6z!1?uV)<D_mnr$_Z@ddcj@2>;DN7Z!Irt&zW?-c=f2@? 
z&xq4P9E4xQxFhbQ#>ZQmf~bf11Y3y423PY&{q5{q0XWr<4%B;S8g*;)#-oRsTaw(0KlMEw#V{s1P)Gml)iUCAqOS4Ds{VpgW=xTO_Qo+ORW#8G-X#R zLHxq}eep6D+coLro`rYDI=t=vrL;|zsp!_|e&~l5HVbwcYME?hZdSIxH#R7~Qgo>< zTBwDm4Zj$s%DUjdaPVtd zzH;}};>W=vJBMqMDVzz}j+On4z2v39!Twd^we3AejXS|@>jB!qAt$2CZ=Du*_j~Sh zAALA0pB(M0kt|+D1XnBzb9&SHF+NLDX0R}!u;wd*gZL)F`|w(9T`W_!QqSfM@N+(E zy7A^>=Ny89Z*8A*Yi@Iv`YssqYLVNL9vdHHE(&(WMDhlPB8E`X5}A@69nMm==8H~) zHX-NYYdM<{-Is^DAkdm9uQeNOW^PaX;1zmi$c7Xp)a{#HPClNh4A?jA{!Fb+CGx1c z-{*^0S3HHgMK58*w5YWoKIE_+IG??b=XI!y3Y~jDWadmDPKFnM6 zJjJ>Fan*6hVWxSEcP)q-K7l>;U~%Uv+ot}w4La;KM5HZ=1#bhX`4D;C6Q9|h7n6rN zT%9(-S?A(mGVOY`5#mSXuMw;f)<$|GyF8pUB~TXrxPGCyDRkC5f_|JFRQK&rZfW1s z^0d{RUhnKUpc-LSw+maI6YHw;Zv2>e=d|iLh3bkbRx`A%*8TKxk};Bj=F>_bvup{7 zH$2lP&@XjH>0CjeUtEwuezk9>xrk(JKZ44FKOjq9F39RYaxE}}w4Z`Pah*HJp=a<8 z!F@rr^KAm1o>gHnxLGRDaFb9R0z8`Q+iBxCnVVVIvN*Bh{u#jnJU@L*g^TlNh@B}ru9CPE z4zHz+J`MvVH6=AJ2Mi7l4x5dh0gDWuz`v#gKe6K)+1XjKP*FKLI#N2)QCivLv&wqRMZ&ROWV3D*j2Da0FVj>4H=hTaHD@Qs&!bB{})yZBv!w#r_`o9()sZ)h&Da@L;p!Qp@GSt^2Fn zjmshCx^@Obe{K*k#4jM=I3R$v{m>Sw`_Uukf&H?Myl?04 z9VdRnTTJv?a8!?#vS4m{JzFk@4D16z(w+V^J$Q8Q)M;|Vz&%#GFgQ%|vr~B?;YLU= zGdm@qH1AN8^!g$_BbG}Tv6px>2cJ+3``K)OlNrR|h~mLKK`bx4j9~?+T0e-Q(|)!p z2zW3L$Y>66O#Z9?C62#HZovCuBy&K?jO_o6wEy31D8(FYpjIR8BleL}*Y7!*I6lyN zF-E_(zTeZn|I8ck!hmQf#I$H(#gGb~|Iy~Z)w6s63MXlgc02{@xz2lu;eIiS`oB-q zlaq+fl4@`XC*?1Qqp?p@j!LD`)z5e)U$08O46z_WB(YKpnOeSR>0Mj1@tYF;ZPp&O z=X(5s`^!-ZeVbT|sddn=J0u#me`*AAHux34h5!}$*bCO7wfRI22B!+|b5f-QpmCC7 zMqvl4pYMt|%?xNDYft#(v(tk@#YhA1^v~L9DgJpDz)LuUarsj)H=8$AP5f%kHi@Ia z+)5-!u46TXe}UUlEa0@F^fRpnI@y!vHKy~hKF`Gk0>&zuVb-9(M-Dgp0u>-$YEK>j z^#vNf03IL_H7!~XEQsITL{Xf)f9w8mDsl`zn+nJ0pNir2l^{YptM4K@Nr-x;%^~({ z%lp39tyZV+RLc|DDEg+QUOzK8VsS9v%^1Q3q^&0Y2@!%3SoOcP=-)i0)YI|02n5_; z;8i~a;MQ!c6BF_O>0E(@x(FhanZ;|iWYN;h#rA{K7Q~Xe3G~a?J|@ zBQ^x@Pcf1yb2z89hgK|`lgr)c(0e{Ek>|;L&P}GuU26K{jG|&=&663bNTqV`d~{O& zc_=9dsF(tRfPo?XzGUH=LirPi>6h#z{w=X=8lizcjdXD`GPITB3@4PtGe#mpfdl)= z2z*B7i$1bnM}Foa%6Fk+C`Q30qt!2eE{py1C?H<65*cPegic8@$7N(=&vku|1q|PN 
z!M>Oo`2`_;0Gx(%lAigjnes&cqzPS-V%b1O88Yv47Z`*2q!MduX7P8qp~+%YZ=RhK z(9b<`FIs}6J@Pj%8xZUq>IKriap7e1MVrpDcxlZsptmIHmEBOD6)G;UMax|2j1WAV zk*>-^I zmX zAnfBqqXhr`^Z%kD3wr6fsR7GG_;Rh|>+9DV-JESR==YhW-d>bsP4C1E>=M!6He!fC-3#*g;z&Qa?HqS6v%v&`so=rj&Wk&5B~yZAfV)? zH;TE8U!#bnvCXvJ9yGb(l_*!`+8@l%G&&wyppw0_%W7B&CfjS+Osm>0Yg@!}-K%gj z52-pmO@Ex%aM>of-M92K^}HU?3?8fIT97YNn2{F5)L-om4Z$9}G8jrR<_&+_)lV=VFXk-oAG{#3 z63UwESu@jL!82vli^GmT0;BgKq5+zdYE^5^j4@g4%A+M%`WVcnweSSK_7Wxe+8*v< zz`XGVdqKh{Z+n#_qdDAfrh6mt_RBlqs{kg>)tb%djOB_~X?OTvLz9{r3?|SVAH~&} zPO04AoKp}H^-S$fSLkve@-HYi*nInJGTytEfoB0;LO{vk&=x;Y~&^%JlEG))hNE*v^T_4+`3&n}vy?nnl=!%7g53cz{u z?O)%z%+tzReWP04Z&gk=ewrW5Hz+imTwr>Nn807Z{_&D72t%WY-9=WbxS)n%vQEMh z_N!JtKHP4i&v&6|Vk~J^={9#A66f`Wfg_(w z$1wN463{_<9&NvTA9~v>G>RVxsovypjzuI5E`o?&Y(7`3J?707Nsbn%Fy3&`xLYh3 zj-`+-05_)_&c3%-HC(G)WzaPj5IPcmlE`+uSH=JIg8 z>0X!aev!DOII}ZRr7L2Qx4 zbh7wZJ~q(4FN(keS}W+oYEO7w{uR>A^;UMcn?$V1L}6l7{lnRqP!`EZrZe$tDlp;o_@6;ceadNErMV8*Ei4rrG* z1gsd9vYge`MzZDJAl>kpcTiOMP$FH=#&G%}1SEwzx%O)9oCWMc!VnP+Z!Uf{!OxH#!hF&#qO*m;BH)wvtVZ^^2Fd)(h#OQHG= zg2Q5cM3PUUH%1EH2dw{cnanW7+P?*`g5y|+*=l3m%O%e?$zLpXTO&^QIFtFm3yVrm z(g>C_ZlbXwkq`I`nq#mk^;zg-e%wHov%;e$_mLKirGDcb3RNzoaJ#| z>-L9wzfgNixq*EkN&;Wbay(pg7-qj*ORlg9CiWCma$tgj&J>pmgj(sW=)p3D=@O=2 zhlwPqC-{SVN*3ylS6koQT)@B&!jHITQF6&nw3KdIA;NB87z_*4Ngq z?(k;AT#|f`65KCl1Os3aPC|OCVM4{o#=l~*&sv)5r}Ew2?3TGl@>r_WS)@8KEw!@g z@&roTiGAG7m(F%u*t(1QsSWrs4~xRP>&^7rwTk}etO?gJwi6-OyW<}BMU-{1|C`Xx z>7^+n_(CYliaPUYDY(hYzV^qGB_nmdPG)zlT^cuG?jy&S+FxvG;$*X4;hQVh_R+*7 z8fE1})o?rJ*F5w^mN?oxVfAZ5ID-!nvlJ5k)r8Xj23Poq!V8v>;T9ZayI^PO9Nk;tef$SNm(W|&v->>jpBms`|R%kLval@{&e^R|EG!={i- z*{YG5R)QsK0|Min2)9a!;39E<2ksC4AYD{KIi=P(-%k~=FC-F3V5ljZ9kuxD`lUkc zfk+L6$Vg8voX&8F-miJ9hF!JZDzi)#2mo`;h-rx3$onDYFK}4~?@2`q#A#Yo{N{;)S-(tUzP^60 z%Rt$!eupVViR`z5q92>8&PFc*`2rh!xf=S6krL^iqkq6)tpM;N`k7C-q%RfySWnPO zPtj1y3-!tLCt!O_+$^vcY(#SENErY{QjQY|qvaxorSdnC*eeF~7i1a(2Y?jenN3{ASBUPJM7cg9KaE)7>##kd7k-QT^LgrGEC!TCT_HYs!ok0d 
zo@b4{h<{F!pPvW4%uK?Q^PF&;P(W5WR@T*>RuU3nmdcqt5JhqYGB7P8;Lpeh@v;Bi^P(gV85FHx=5ifWpI|k?$DuI7z^O7=t}C!`>%^ zpF}Mu-{`-MFmi(bFWdr@2=t@)b0zv904R|k?7MUNPi+5LX8#Y|dcuGHBR_pNb3j60 z5*Ps9N3}zQxFiAiG#J-Q-1_o|TO_3ab;QRsz#{$=Kk)&R`y%jyNd4bFiPW2)T^wpy z7!>CvZUG<#zF!E+f9IAQ{SQF<{)3eu0Y=G{{el5>{4szZS!sk%-1-md9US~$NBD34 zHjesF{DcMsmNEK*0cig*0ImNA1NiUULKT0~zW=~aKEMD*j9xGR>OTfh{+A2;FYW^1 z=YJgWU);qSt}x2>R9X7@_E?WUJf=!UcNm+vhR3!3?JD-8(e7jk*+|?=NC>SD*ZSc5 zjZ)0@h)=wGK&5#^#C}k}x)(&0R!&ydOv@3NpC?SuX81Gdv%~dz{2c3H^XhJ?QbjwE z*o``|z69s=Vt_d%;WsF>xL(%AQK;VFosc%{6c+8(wcVEh)j;O}PZeSs9T3-Kn{X1g zoKjirc|LQz5N>;)*e6V>^aN{k2EMIb^0@5^e*iG2sxe8BA0LC`+<8ySb-ITiH3^1wQC&^%;$+50($zzVi zey6&lQSODviPHkaER^G$=97Lhjt_ePd=*uD1?*%26rwp3#maQs)-W%zf*fE)SG`8R zt<9ve;S5knu9g%kmRW?eUnEGrL#NVv9`HfDz7hc~#X(Uf%SEflW`V4f;Z1Jd})@+A%uH3H0e4)7pUy@t1DT3=;VQjHM zw8~_?t*)uN5c;l@4N=id;DndMGQn@Tzhb7>f`FFrB&n^4DMIA>r|e`_B-PhBC(|3*haR;^5$YlWQzfOX0pm~823E;rpn-F zC4fo{us9y583-Kj|JPb34MNK>nnY|zw?3|F)iiEk4=XC zMHeUYk`!cyqW$+H)Gg3*MM^W27Pcu1Q|mlLx#Qx8KMa0v5lW;O0fk05ph{zt#mchy zqpLH9={1`7JBo3ssYDX+AL)CCI3L~S9GBepL#SHIVNgj?lDknA)wBWZAVuBt{@i0? zILppz(Rp)b&a%zi^JXl44?tn4M54!c+)i_57#nniP1r|Bk)bf$N|axR5V7tXS6ZF% z++Iy#Pj{oD(=FT3bf@CraGR}|u9|OTY&-O~Nz}b-QQ70Or2jkF#aV!QqslRJ<8fI| zvDD16;NR^elJEt*;&Jf0b5Zpww8i?nxr0%5i|Eyx^`@r7*4v?1TYzyzf=2_TcNavo ztR(w{$3<1Tvt_hp+x?_MnTB~s074xAiYft6Uof14doMiO(RbWBxK+;IUtR^z=PIGD zE1ztjcA4<3zahlY7Are_)SYyB{F>Y=5kx4ktn zJ!h&`s^o5zi(s|yzv@>UbEsw(&wO>doAg>|^hE+=TWN5ds8UE;DELovaU-1L)M^7(VI`z5Ur52ImvKC3R_t z2mtKu+xVHh*`ji>)P~~mdqo|I((?K?Mk>xrgNP)A@PwM1n#6PIlv)>BzwyCdWxQ(@ zpKJMFO`eEM@HK63t*A`2GH`#2OtGmp@lZmilo$|6`?t-k!K8A;;X!+iyC<8Tzy*^? 
z_OlV*2PINVJi##fbwy(U7_i@)t`Ji9tssjRUTFM=*vQW$5n0NcSdz25)E^Y|Va-~m z`YRu_q8PInN|aB4Fj#_3ZPD%_Z`J%I=-*%v4jnXcKoF135q00-uA)r#wg=S8tt7 zk{47=oE2n~25uR#8pcX#{hoDxxWAvYTEd!5gWf{fNOpR<&assT5(&O{0NjU5=qC9S zz6_Fh%vmf3;RhQ(QRjpktgYKGeGM!qGj~dc5Yrz{5pdoAWZink4gj44Ap$V1l;D(QKhS87EOmwe3ctA%K#OolL-K;Pss( zn$1@CJ={#S&2mFxt|{)kWmQK@Rec9XA%irF7sk|XD~p+WSUo0Dr9Tk2&bs8_2sJQ6 znPy`zMTV~*@kL?p2!n#GSl)&j3LNZ)1HlO^2fOmJ#kJ)^Q zBXwbd{E+)Zvm=?j_gC5LB3GnMYf++mMP;ogJqR3Ezl~6bcfDaLkFY3M%J`D}V9{=D zf~1fM#R3B+P&Pi(+V9A*F8d;z0Z3%4rFvPZdee;>VZRS9Y#N~o%k^5cd&z+n=?3|GTc~ zB?q&leBWK!o@n1E;$E#HFSrzvHG6k`+JcyMgY@={SJk^Sia_)ZsgBfhHI#9$xN3-Y zeKW$*4=I%6@$QsG@8sQRA@vW+#w}c`1bXw?aac+u4TWN*Wp;kK&2RA{(YKF2of4S&d_S94$9FB55 zXy_@k%Ju*LB6QY z$K*h6h0i5C5&?ArsP4^!TxmQx%ez=i+d-Pv1CQH%!zH)VQ0z9o9IGK9oqi%n{-gOe9dDp(FC!G+@ zhwD7id$HiSR6ixCg#wP~2ld(bSLf8&+lNLywcVi z6YRL`#*=rAdzC<(l|0_T9te{0q5JV2zh00|FCT;KEYc4+XVQ8|8DP1(u=QSr?#7)r zJFmCnR4r&4evrOXX5{K`e>n7fWVP=7tAYRB0+-tQJY~VoKq7hQiStm*bod{i5?_t) zP`0oQV%1c`NDJ}c(L}*woJXqZYjwsEk9eZpJNT>HVIUgG?Wu!T|!J(%J zlZN>;xEduY4b$BxmlSVO=j(8)eQgVw0y!V=HaS}YM5{UG5T{ANP6hYnPSq#sub9Q- zN|)qySJ^)BX}wwsL26UEd6mzk-r&x0Id7Yb4o1K$yF!6^+r#;=98MC)kdRhLysUDi zTXHpSvC!x|HOL?GE>UB-Psm92V?}E)qrzTMH`Uk}9D#zYO(oD9@RL3L%i%U9ey5Ee z1Hv7y*NPieJA&;xx$$DNb8D$LV^3xP)=qS_0K)6=*UsGI1+qOM;F|S%*|g3PYHr6} z94LJEWg0EB@w?nnJW6^eR*@OFqw9`sx2aUHmW`WIX{d?Kq)6h+#FYRpdE<%#jl$3F zh-f{oacL0r?!cXADX0heQt<2h`aU$#?Q*ls&9F*61WS zl7Q7xM|~f1lw=1k>x8=P{&GpPX)#8WFP9+roJ+ht;@~u9@S>lr?99B~Jx=MhHtB3# z=levqC}eSj+IACx(c|44V-1)0jP5sfvO2Tj95=)j%(sZd9H1px(Vz-ulF@Gxka>N% z5^)gKX%FJt?({hj-s)$1o{b8+uh0`{jg7rw7aoNR(2V7m}}ge8=Q{5z`yz}}G-Uytv*nPT+S zd{xnc0#{yX^*1450k>a|d^8Xg^^o}S<9bEqsqUpD4YlCj|vQL7KwSp zAGI@IiZ1!vV$TmZKyA^O=l|mW6 z+sf$^^uL+J?&Y+l>BJT)R7Og=8^Q`wi1|p<_=@3K9kyJ~0xhq2hmJ|?>@21F(UGxe z#2)lnlI?gM1$*ACH@m@-_du4ii9p8jnKU7vdOm?K{bP(=@nH|PCkD1=ARYW>q-%+V zTXl}QwYa1e^;6xe@AX9?ZDa|*cnzw#`jJ8*&j!xb?V^Xl$=aZ^Gk$(=EHCfKM-#ye zg%;^EJD2q0+j}ELVxe)nMbj8hI z?6TDNuH(#l+!4e>4IS)p3U`r=0Z8+&$4>fGYL(s{__bH)r7J5dTMvhII)hV18gXi; 
zu#{o>nh&?Pr0l~PevI^Uq0+7HWhFa;r77HO4=qpL=n<_8F;Wicl#pmiF<-+Dq$3LqVOOGF{fBy> zD+LpXbHkgL43cP*B2+)pY9Z1N$91Omsg(hPEJ#U{fOG7Z*HhbGQ-S)UA^9<%=bAm% zI)k*4v{gfB*k3K6b+0cESblEM>)NeNI$@i&Ow$r3vjZ0eO`D}mn484DT!`XGo<~J4 z_2I&V`Mt0_-ob2g-Wj;V0YFFVijG83xPDmL>%etQyJ!S4F{;-e)iEkn)mg@lDt{ym zNL+^NMw8oU1V`*%Ev1%LU0L*Q3Q15kQRV6t}u}?h(z0C zbR;{%hfmm^NGt)d@K#bCcD)If0Z97VKA4wz1Gju`ZPp2($!JN>R0p7UlX}~y*ZE_v zJQbmqq+7~u!fOX&UHV(=k0`1kMAuaOJCOD|L-(iUz#s?MU zjwMjp0rWrtcG0Kd11Y?v3g)kXB1XS}y^Y0}%W-qQyNOLolA{OM#>M&$r;|DmM@L(! zfXe|(b?xCOA|(Vq`a6vHy5|r;2Ygg`c~E*7K1ldS5uYySe)X6bPybUnavCo*Kf<2c zPX*+i*hv)J{MX{%5Z3rkcm5NwHM8O11kN}42DOB-XD;Av zQfKGnie?lZ_$s}2p(7zEjZ$e20E|3s3ff;&Cj;dr>*$rmbrL0Z>=kERKpB=qx{)x$ zW)EnljG30ZlmiMu|29n-38r<-$h;6Xy=(=bBKZlFiV7K)&OJu48JWBRD!_HKY<09o zOupz&XCW@>{BS;eKdk|DfGy0VtLlY!g`tJerv zX0!3e=jas;*Bi;=Q`}_|sVsjkp2e#>B$gq^`A<4`X&{`mcd&(upRZlZQkF#BUaNhY zI?S+{Q1{VtOtGCBB-b6Poa2vXZ)B{QgsiLGxUDG{FJheWKHQk|KHNluk>b2dzBI8Y zfj80@ccniye;1~v1>2fxNoLqnsYk?6I#*d*!&V+!v$gQcgrUHg%&??dKRSN4Iy_HB z!hu}0Znm1)Ow!a|%2c(@DK*s9*sRq8<=3yx*;mCp-+~#A&!KA#G|aZEIHoYQZJHE!8o=rL!8)qn;HX6vIsOmeEIGG#mAGNhqs$C3k>3S&jjE6Po`ijc%Bh_`R zP`p*RjLb#V#{q8TXH)9Z!beAA9o4GZX~%F+(>a^qI77TeQ1oviIBeX)4%wNU9!S`B zyp%LN@6wv_yYqGoxEnQJ4^P}k$WUk7>5IxDM3c-mx*gz4@1G*NMW(G-4~r{ci?I{)~$=WhVu*yVNRVGn}-K}HZm;L*>@uPY@q z{Og;jdm4)KH*f@aXJdR7>7f#a9o|b5ky-2;cphPEtQ&BFpsb*wPp^@<*Fdv;%YLkB z7so=c#}?m`rAmq-k?FCRoCJ=xgCGU1ZKI0^77_Tadu3lB>7wv_bM1^UBs{ad+0L7* z)IV?xEXu>LhUpn?Xm{7kSG3@`Y#Z#tulbpLz_FS1d3!Hvvwq1IFU_It(Z*A;@iCy7 zU$SZvGU2nmwj%n1<+wiGpvTr>tuA!9x#i`(>HH7!x%;;4+MDC|#R16mI}JWPd*1I{HrjZiJDq|gGrCz2S6Urh^i)RoOw$-r%zQoD=6;w zSTvY(rqDWqr=AaNu8yf?(|&FJWzb0|_$PwSJd@y2C$+8{poYGcT$%~faYxTgKNi=YC%{ zol7+vs5WB_^jX`tbz@ zQ<%pTg)bt+AnnRfdGwKPf59;N)r3sSAF#F{iD!xQMn*Z2H&R)PUD_w$S^bXP@Q_;1 zG+O9^#ycQDzxGDvV(7L>RlCYUmSwqKakF%%q`Td6sA{0B&a)Pbrzx-7J=?~vIN zX0t@POunUa$K&xTK;kPb;mOMT`}zC?NPkm2)|~_0=A&iD`D8X}iR(>c4eGv`DBbAA zq^%ZKZ{jhtr45H(O9Z$H$DOK&G>K{}@4m`{fo8}MHcmb96vm!NM@-0KLGe|QSm549S-v{` 
z###F7C{#uI=W$e;7AhCw1MLv9VkHkwrd_Fs>4j1NsEO7YwG}Ezk5!kW>rODcW$Yi9 zDZ9i{)~tIL@JLH^ruDBL6CYUWZgmhWA>AZJWfNhW4uY6_M{b$wR*}Z|jSy?Mb0)q@ zlZtqY7OnXLn5+4NB7^NcuI4xAbAJcP3Ku>|4+4YP0{7xDvt832OHwUY1r4xxD+Q$QXT{UWFpviT6 zUZ1ybNp~~1kz!`5R#MtBvirJ7zM#<(-5u7uulSsEMith-=|T6hW|YMYE%giETXsDW z`h^f;eEtjI+H!Y?I@0cb;r(y@EZ5w8wBAw{&P`#j@A>gRbKmru)>{?3(r$IaLld4b zM5f+xr2!O0%orjhg}U^W{Mg5mnzCA%5I%5<{b2k&FfN%I}fii&2I{*ySLpClU8K{ z*RxaBV7z&4Ci9EE90>~94nJ!9Wu+!KFK}N+;3IL8}k+t6rpxe!0jq;(lw$S zbrX%bC#y)1H>^dA-p^ld?L2_8$U=qn`zVIB`{%kdO<=E5v_J`MMUdr+ zl|}&FCgqWnU5+R3%=2gXhPqp%zrXgn zP%1^sMdSpi-@9NrGrvmy)l22VDX)VSg}w-c4RE$T$n#Ltrq;xI5Nvbj@(F`zx5F1w zqun{YcxYd4m5?Qh6JxosM!S$%xUlCDF4s-n6e{2CX4g-zkHDVMrCY2!?qRnuH$Bk4 zov_3%T)V|+a;e3|YvzOWU%X+|V_0O5towz@7;AZW=5e7_|RenrUKcN|JIPq4!gWySs22QpIN)k`2rQy-YtDwNCko`t_ zZTXHXgFj3@T%_?4b0=9Kp*R+P+l}F>wOQ|nSdo1d`YZT+Vl1^Q1nqXsoTV*l%w+z= z_wQTJeNbOD>4-G$gf0h)YX*!@Mpq&)T28?mW}#vFvZ9f?%WmvF_t3nV^}=~ zL4)^!B(jbMDWux+2_FWkYr$mokL#|4<~Xx`La-)&dg%lB_1_VFrud_i_tgv)y-8~> zVq4#B(7>XNMDsd*A=cp{ajM*ID(ma|C3SU6lSUa%1fa1h#PhM(kGm#Wm=a zKO4$N>C#LM9Pw6K$2W71_i$LbdEI1kX^;lRoy}8v6_cN(Py64W5g_s*`^^x){=7y( z(jSPLU6e=4Ujqn3p0+vyu_RwN6YI}4z!(frirdmD;1`KhZBpZOB_V7L5wz9P3grOj6g{Z$DEWmph?k=%8*d>`R_=!->cyl*&S+Jp6XI^7Wso5X= zbN7V6nZjfWrXX^{L7PiuJugiaf@m7NPDMhS!I77HIj_7iCQpJ(gTN(FE(KD!AU!m0 z^wG8&#ijvOqHv=BmX>&5R@H z2!C(4rkH6e5*G!BbUi$&gC(abb|Z`}l&=+U5Vqu9h~Xs5nUC`SCd6g&9Z#@(-d|Ld zzAr{;Nv2b5>M1Yjrw6^)g3c1Y5_4SUBO4wsl9_u=@xlBtnX%J3dAve*0jXKBLEfsG zuN0C!Der@tJMMN^TDA_B%0Wyo@M!x*j+fnwsM_*^x>}GVp|QY3Z_P}hpIyO z`eO7^)M^||>RP*}cPjl}8rT4EBV6P>SUyz<`P3WQ{@RyI87JdgsE@IH)(Kt;{#}L& z2bmVN>+VkTx*u;rR7+z7*WCLc9;P#YEBeDz-`l~(21-lLC53k8Q%v2wYA8AKk}${d z!(g?ydS$B@!LGi(3?dFmMxq`{NK`vE3goiR#3M4DDzl2UL+htXZF((B@R^(?@Fwaw zbv@jSm@A6e%*(?>%Ksl(Zvhom+xC46qO>50fV6ZY(%m85-6h@K2vSOSDk;+4-7Pu9 zFf<4Z-3$Zo=DM%@dA{#`*V=2bSimrQ&v72-dDQRk-pKylPObtZhe=zf{+(&E)GS*m zx=F9(M<{bfW5H;;@9u$tS+iGHp~v3Z=;6sKS^BZIvqxQsa#aiSnM9HD!0r+4k9+D; z&GIwGak^$(Yi*AC>tWj5x^S|91z`G)^0?-%>p9j!SMc+h_w$V$JBqpgJp1Peku*SA 
zN6GIuj$+7xUmM#v*U>V}G}C{h(|+GsiTAW1MX%hlyuE#mk>Bjbzk$M2BMNeP{mFV( z;WwzU!d(y6po_4Z+?=b2E&ooV+)9uV)eKkZN8vXmn|WFzXR}mBNjaR+f%2S081Hi6 z{-G7iqV4RyW`(sdF^o0S=RWaD=GsUC3&?u2i;}BQp_G#URuXRAu}mN3q1#-Y!IkXN+*IG0S>ZB$3?)s~`TzLMS|F@b-P zI@_FSX;ZT9W7Q734$jXclB9C5zl(>&Lf2M_ZTKoRaT27*D7eY?d%wHcordOK#3$Ih z<#3yLuPYY2S966Q?hzKv5`O;_CXz^_a394DBrbF2e&acxGIrO}&uf*QV~`1fpGJ3l zfg+#3<}J|Eu|(ntUVo8^BS2#CPd7-H13CF5P9$yj#RYfv;UL(LTK972O3^>NSk_(` z48i2r!4yn)=W;Y#Umxudqi`(cs{D@58s1DDqSxB?eqT+F084N~3CPHlis=qSJ%V7C zt#*^R@{so=HjOonST|pk`AWipQ*cMIgbzeK+pit#SdNmUzLBDGqq+Yu1cOZdV{)3W#V@I3c z-tjn#FD98Ucm*dXU+23QlxX?W^uD zY(kvF|8Dr%2qH{nD74e8GG8aH)|uoO4PidK*8-X_t;I|Ke)DyPO}nmm_0a0?ko@tj4=cUY>ew$e%f|iSV5_=l;Ub z#QpVRkyD)2oxq;v9eo-tP2IXH==IZ>($j!MVSt#azZ}U2+NVCss%6x_dyW^PN=})p zhtK8MgFN+XK+AkNW(ryOgN}SzCfrBv8TLyV_MD8n+jUQYn5^zm6Wyp_A>3}!i+{!Y zo11ItioEc{9tn5U>>q-heQ$#Ts)pO)jj1bW8A5<`=~)*1Fc*9_gvBRXl+oVy=!ib6 zC6xFOE%k=3_^Hk#a^i$;NQ0wD1=-iwv8#Z2wt-X}Zf9Ou?h332s5^Z0y=Q9-X!3pb z(19y`*S;nZzi(MApzgG9`!;?2W-Ri3r*rAS7FvUm01mE{y6+!B2f__}GsIC@MRjjV z5Wl_eE|b^WPoL7d9L_mZHs)gk;6IJfv>{kHY!|9H>XFFZHQeUm3Z2sv$e<3#TA@tq zOilZmEHF|fqa{ltYGn*J33|>On}Uaz5P`0N2hVTiS=;K}VD_+*zXHcbmlX;Do(Zg= z%pRYY;2Rb2ruZRuIb>_{8o~;R1dT%CXxTzf!zR6`8>@Leq09UYywf06{E&j?Ndk@) zt`;ygU#GkrzU@S1?)C3sJu21q!CHXp)_RVYDtC>C5pXcv+Z!=+bYY877IwTlZ)jdZsMtAJAO+hjK(-4<+zV7+m zuh6o|aXFlk%Vq~#D819g0$%=u16gz7*W_nhgXIyFnbi*QE4+IauvWcv>9^&_=-^D8 zj)#YEe0vr17unQ0rO<6}qlJD-GER`*FPlxR$-3Gsr?VKn7oVF)4R9v%1sI-U9^}AV ze!UdDa~sY4!*L1-JrVX0IUjmK{PX$7ur7xW7r_j!AVAFFVdHmw7Y2`P1Mz z(Kc_Fj1!D@gzQ=!@Zqs`bXCre?h^nue2=3VwR!qO!^Ip&Im(nzy8jbi7IuOK@1kDg zvV^w+gJ#9w&^OmFSl|9U^w2B&f+mcUS&t5STe@=e+Yg?U=>aN1&*Hu_S3r)w;gw-%_~vDQa_;&D-&wZnl4kj||S1 z?lkL7@$~+R#d@fBRyji_&J?UVTKSbGn?I%nwJlLizdYwvlH$kucJ>)rd~~Tp_7$tl}AeQ6JGHEt^zi4bFJLgrd^^c zC~oV|5Z8~r|Lu>W6CqcTpjS4*e=8#r&)9Cb_ZiJSy*|LC1Vh39AUSG$a=cYFsR2he zg)cTqz8(rH(hjSof*WHmcq2YMX3cnZg1wmN$I#-e_RL*Cii! 
z#z$)Jj)S@U=LepDVt2Y)q`YRu5h#_XscAP6wcnA17U5#u#tQ>>7cd5&AgU8tP7Wwc z1uq*I2$|ARapgx+bnr`= z(A@ z#s>z-7W=z+N&aWX3ZV83=iDTy{osh4;YJ$a4(`Lw>XWG2&U{tNiVpOmkMwAmzxvNI z0J;+a@)+Dp_Fgox;N3sCj!&V_N?nstr1)e%A|}t+31Ivu;P}V1jA>DZN~!Lhx_haF zhY~X`t8JHa%u5^Xs^p{}s2{sS+k1(Q%^E%VCM_$B%(=h&F1grc8no=5dU%|Fv0G|+ z;c?KnyTOdc7^?+L-dUuHU|U{3Vm|IBt)m;n6l+!pC@`p(87JGSH$-Vn+b-?#Rk++A zhm`z_IbqKT3g|o8EL@`oH#+F?msqp%H{N!?z@kaIFX~PAuuCNMa9_qP*K7B9_#2GL zcYC!@X%m(#eNY8-J;(<=x{jR9Svc$!k|v#9^4w!dFS(lq*ux=rI`p|EUMs~1O;OzF zHvdCZ39b=e+su_(I{T;o<7!`Y&h(!Y!VfC*xe`l8tIE0;tK*HlMq^i6<(8Wn z^XD~bzFW{qtHr!2^i8c=!%Uy~@?)WZz$Iz5@|lus1L1Ob$DQC-O7HzutoZJY5KP}Q zNTWn!&eD&4ykXm}mMi?o8k3)^X7c!cw!r~nqp@b zO^ut{H%Y|Bpjl$h?9*8F7S>qzRM#9osIgm2vM4%d&TSaevTgHBB#Dv3o6|Zs950_k zC@-^)wQR7yq|xJX31+_W=&-_>D48sRan-iR92@J{*4XuwZmtL{d41jBf^S++{kHqx zC&@1QL$?0TLpIZ_OfQ>jzChPtm!f9gH$vcg(^zl4P13_WnNhuN)lFb}}*A9@3*>E_y>X00`_V8DGOfB`7lzQ~1TdfJ$$2F4; z$1GD=e!pTT^2-IjnNmsGJFLS~;LDxrSM=H{@H<&1v7bM{Z_lp!u<{bZ-j#%2;JbRH zS?=I|;P#2uce^f?$ydC=(G~i9P*0AQUFMk%5b0fZO#9!umM7p+^mN?C{cu{lM20;q zkuFZWBT%Hej#@e2 zM$&evD#P7gZ}8Q@;-8q~nLZnk{IW3IdNe+`kURK@Wp&6Z9O8KSdzOLi&oY0ut+c;? 
zxPm^3@0#Y15XN8Tc!$F6fo5>s0n%TWMiBr z#4g#*WR6GD_o7GAZMbi#WRMO#zAosYimhel}afCDd(;cw9=S!t*`DZy!}fG_p>i2({6AM=&vB#=C$Gh7FYeb;{)wE zo&_$Lwdh71w$t>LOm>&eW6=C%Gt=0SjHXp@|3~`?Tk_c57%V7tiyMO2lh8c;fsHS1J(uj-ok2L z(Yn`>;$L5w4k^>x4DHwbvPE9``|yG1z{ilAHSM$SMWB=qR+2Mo>}3uGLt@Ym|Mr~# zcz)EzTcpIfb4_1o%kw+C<;SJx8zIB7M&s2cu>pr|JOkdxuQ>%aV)DyWskV0eI<4_6OKTb}+IJ1vMEo%FU%${J+iFYc$Gi@;&^-1+uZn2hA#6w=1%WY2Qk+_iZb2 z`T7RQ06+5k?+aY-J|;I(o-_G>d}M)`ac@!i)iua+QKj>pL!g&Ul)r#!T zo2N5Tq;895i)D*P%h#5amLj0J3N2XJjZc^*t&Q#4wRN%F=6s}W0d8V{Exvt93VO>o zhjuiy34=4Rq+#f@53ms4S}o_%{s#PO+ym0p0zXaK{Z^ZpRJ(bDlb_w#rEYrme!v2) zZvQ8syVy7JoSbmea3WE{KkMPMJj!Al1x=pcW7~nPU0J;V1@QD@ZaA8-CN<; zW{6{3v8IDt2CiTdZ}{6{2ZnPDtAxR5yE(sF>o3{6-#67`%3Sy+>Opw0)?;%Ep!%QL z?mtD;;8Wo$?ZWkt?Cf!iuccc~Q=i2x=|}|Y`J8zMF+XrJmA3VC^Ig79fZIVLyge4m z6;WM_*wS9d`kV1C@)|m+9K|{8x;Jv3Yc2ml?l^JBS$z|AzL@>R-sUmQN9(8R3o_CI zA30j)>IbcMe8(EMX;s#vA@Ks|;(-RQ9@Em5vIcjapZS%PSSi<)exPV-h6Pwwg$I~S z>11$;WCk+O&g0R#sY(#v6J?I`4e}S3ah)|4K$I9zSL|VF)24MS>~g3>n`?^Y z=89r;I?)PX$)A6d+cJmqJ9%`MpZGjWRfiMe)i!nPD^-Q`Rr zX(7i*=(JKpFL|oXcH$NqAk;B#9#dG%K222_&$HRsXTy#-I1jw^%O(WyA?FkB-iG*S zCA_DPrh`jl)?7P*)lMnOfeA#rw6CYcNk0tULoOHl;M#5`sGF=zRL7%-$CvBX z&HGa@XtxU!tJuiaQI6epoK$6eC$@rmaw(5-ef4Y?m@xQ)_0IE0a+rsJh5Gu8p zWX^L?s7+YF(bRbV^!Ab#7S?xk8ioXN4xqp2;g^@SfQp*-bU5fznk`byga{Fv?rpHi zrq)83XRH_-{oDGnUrev%L7E_fFCs;_1dWz4xniU5zC(Mz9M`+*kvd$K&Q`A~_(3Yj z9={m~UPTs!3_o9CsXtiChAzJ{`p`c?a{A*balgkyAdEDEAAIV#{Yb$&=Y$U|i zr+DuQ#me1hpt4-tO6y7}vp2Co8;s283uirofm?Ex$9)W2a?#6qmwt2qGSM)YUNC=) z#~1MKTCIZt$4RK&U%8{^@Q`CKgEoUHx1~&EuCN&~;o8l)M;66MRr0&uPu{1KVz4^O zQ18}2rvisJjj=^yP(^vgY3U<~C^oUiJ92cqEGO-#nC>f6 z_Gi`!B2&k)*sy(xV090cyGA#jQG9rlNo4lzIJ_Z%kK0T+!6NbI#Pxm<@E?#B@-(Izlh4g=l}S zNXb_>*|MxIdxJbm{0~AEVm;OlYj5zeoB6)Ay%IabL+0yZC{#9vZ78qi)6~F zUkJn2I5T1Ysh>M{pcQ+};5jm(ed%*V=3HTK*vSi&&1xwvrp^Z#|KY=tzd#)E{251# zl=x=NLL1k>%;4afRH(~FbMm-4%|y+lX*;c}k&$K@!3b;zB)(d*w>4Yi2W6>uTS$?T z%rY*9@EZ~%etT^1&sW(P=(QI5_5Y=HF(yN__$uP2k@~y7QMmkfy*^7KxHE8Ab4cZY 
zoM32KoaO_iwo1?K?`BsdJw8D@ynFpxHqC}V+q&M{h>eIN)DHc*W~QXH@6523-{j%; zctq)PCwspoHfd%auH+~{`1+ku-$j)6)2tJO_*27OzkhrhJGT(EetCU3?2KP2iWA+G zf~9`tH<$GNLxf-c>UBa6m)Y8d-zfGVV^dsip6*~GKTw{ec7@-qSK4ilrSZGMJusmO z2&p>>;dwbhF;+F&Y}E z7klTsm_Gih&Zu6vpEg2cA^}G8IQiCoq_uK7iYGAVv;y8S<2^rwN|Q_MQmJRLd)?qQ zGxEN>%y{^i8l90lz#de)T0|xo<`D;5a{64Iaha4nNU4%pVOvDhc${~o4#gz zsh3HeBQ6#$5k;M7CWpqFi)WMB?(W{b9ai7wOpV`P<*ZR=$eg-rUh4Xa;7+TS0;^v@ zZ`MkQw&1fQ$+iOFmDkwwqJLI=n@TFIJZevde2dxIR~_aBNno3RSMqkTkoYwl+%2-o zn?^zxkYhc?Bse=6VsI)yxv2ysZj$kJGk}I>!o&h_sco6J91VCB?rY~nkI2L-Fxzeu z0q2^>72bdvzLERsB~jV=-fJx;ExP#aq`;h=0rx!j&L7gh-cGhA@KjI>Z_&`YaJFXQ zIX`!W=|(lp>>2ox9lWii^pwq_0r)aq6WD)hdRVctIzJlx;d+kxH{)Q*410(Jam0Xq zH}r7VUTj5L1$DFpoIZ&T2$Q6$D~=(sYIIB28!eSvO~A=TB=UArv=(N zdiXeM`6dsW*@|OwIW1nM&@|0`)CSKjeFodAV!g=0ND^JlV(C@w^3P4mpc%2zg<^@g zQShUNmY=!D?6GOHTm`9}$0d2~mx{=_shh9x(egj%t8EQvuM>9VR1SVyIPa!=0^wD1 zO5QM(;)7&W46dZ&&AIG+RX(hD3n#k)=z(3{?MYjx3K;gurPJIH&lES^=>KKac`b($ znB0m{Eg<*`zOsHAO)Q{R$3EE&RJamHrBcyiw7%KhnAbJKd~@3y@fr8)ORU1dE@>p* zyHoKQ)e2#_7Ukn-{5W1$w>nt6gleO@7uMF&5a*;(TOwcf*9QAdRnD>kOHHQYWvq>2 zijC*3dgB9}<);t@z$9Wk<-E902lM>~|2#+?Jb32({I(Z8B#Glci>>2EpD zzp&;aXSwoJJ*0D%LWZ>H%LJ+g|NW?{9pwu?Hye$in;FrnUqJEsBLx^W zCss~T1BqXkQaUE8J($mf;gGT;7dsC=2=lS86?EZkDuXEF_E3t#$LlYw4gOttmavxd zVLb`PD`NjvqoF>sduOk*i3IxUIr}=#1x&?RXK$W*U8R*2hexU1fJiQ^#+0Md~LcrUtw)M|L^QT z>Q|0nIkPgL_wsZ*OlwTvVO8)nyRt~bhOed2XHN6|Xj%71;J|op*WGyD#{H>(1WzI4 ztDzaVrr$`IgQFUD^Y9d=Ja^<-h$fM~U#FX~^-bB-JiJ}YxIP?v%sNb2s@T98B;7Qc z;ud&0=WjeVKvp~7huCe&kY!CXHIYG6S^HA6xx!u;7d)YnIO5!Zx4W! 
z{>-p&{8q0O9QqBb!haQ{Vo)DXZ})jx5BG39pmI6oySKxiWujEgJ{6;XD>YHEdhe$~ zwFY5kQyMNcuCVVhgSlDUPw81nayDzlqr}AbvxT1)-*?rs>Vawn#&;mHtcxowR>KKp z+Co7#c%@z{;@)x)T{f^HVStg}@cN~qa=fJ0$gs!zp|Dk9J6#WLk9$(VL-yVJgtDVw z$urQ5YNe$*cl0)|gk;>-N5(R+=MWJv+2DgT&}q*VST~B00MnC44A-n9TnoiuJ|<$K z^2W^lDc~2dA@Hdqup55k`iS*8jN~#Osj4=cc1gN%2ld_04xFlv-Oxm`PUlN~^Pr`n ztrhlDgIXm>a<<-^SEZfW#|mMm^tm zOUPOkV{-YzbUiy392ZRIG*mZWz$p;CEOgB^_@4<^Y{n*4K#DYP57P-CFxCY=aHVOVE^6iI0634 z&74O$H1|LyFFS*}V>N&4>%Xh>rLYk`EJ2+l>hy5CFpuZ9ud6~;F9g|3DuM5%E%x;yXcoD@*2oh>_4&nVGfZB*FZ z>CI;_&DeS_x8!EwXv-(24i9WA`ZH0;Nmmzy7D;gf``|ll5jNQ`(N&pWyXsdJ)_CnB zUn~C_Pbf>Ko+l##%HM<%BBo+02_0`OK2aDAKh}DZuBoW65DCY{iTpwk1FGro6+}cO zUP}VZSVS%){-~(D2e>*k_XS=xc^;kijd}q(+H^96KNKP-h7zNk=>sZi7E zJ$Vj2f@br39>K?<&}`2ML2g0sc;ucapU47FqRaQPoSoat*q(HA#Q0i1iPt@vkcRV( z2^X+x*Q?24XXy}38U!5j|K~!XLwuuvxbIZOP9=i)e_bUwB2MZHi(TkCarz%R!f_BP z!*@d^LnB`*VNzq;$qk^awf%^Hh8ibgiSx^l<|rDusUd#=RJFmpc5VVng~>?rzk zn@`ON-WIb;{N__jX6ZUFQWOv=q{%DsI5^kE*kfX#bgUmx~3YXWw0KU4hklS{K5I9)uj4oJ{xaR08xOIJ=Q*kxmkaPc-f6h;0D#} z&|0;_{l(%HY<|s=S8q)Vk%#s3)ANqgLHdMOTZB^w&Elk7^sg+BE2z~>x|ytQ2=Im8eB; z(dl*FGV6Rh+#?ng#LJ4jL27YSdsnzx}p>Ki0m(SKZ+`4b|PSpeSoVvC|o4?z>;o7_W{U(Vr zg?4Pm!1-*3%y;`F@*@q_sth?c$_2x+RF2L^DI7w4a_m>qBC{0io=s186l8>&;xuKX_E1B4U88rfn1tqqThNuf)9oaBOKvC&c*J8I>gCRN7Fdl5ev;=V@hQEiYXV)jB9y=a4;~}0vg>ue_0cVZ@+Just}VLm zj(ZlAyLlz^TpP4GkrLiY_*144^QBN3-M6SoJvxRPygV~q_Ls)V&h&1_IL1SvL6zOz z%EcTs-)k`|`t}-Yw)#b42|Cu$qko4=M^@YDJ5@KsJaoySh~RqaKq@QZ^LISlK-9rH zzfal!9?eSkmWjs;-D8==k6XQ{+#El zH$YY#t%cjj$h#HjgDMIRoAVBCWf2!8e|VWwr2jGfoleMee)Q_7s5AwbvUT(m8K$aBQ=z=QRfDy{?xyZ3#>R||87=qz0? 
zY|=*z_{VvurW{%d3tgw+O=*^p3tutm6*0&GV>mb*^+h7F=ifzR927LfgNEF?gT+FN zZ--OP7U$+z*k$0R^SmGvt1FlF`_9Lay@Di2Ndi=O@~ad*4vI&k@<(JueJN3Cy}u_F4z}yeRb<`S=d6;cJEM;xDmWk ziFxCxB+G9odXIjytSl$vAX(e13odToOMnS=woXJ&t!&SCTW=aOGT3SdGrosQAqLC* zu&?H;*@(K{y1m@A08gZ6Y0zS0CFJghKK%o{5Ar-zgG$Oe@^`3f1Gm+9n#K#U1jJnM z*${54xPCV07yi|ryDbCJ#}U?3r(ui*+h0ukGE*b9@Bl9nl`C-a1X4sQVKQ+USxhmr_FI|*vhp*ebx zvn1B1D2;VY6^|UGTE8#s%!XF5UgruSdb{pc_r8f7I=@g5)-hBgKx}hpjFV6FGj}&Z?Hcs}Jpq?zLq|RpfQ|S={ipKYlZwJ74_K z0aZZ1qsnqzh4Bp@OX8{n{q6bJ(OB23kt&q~cRXQPa&pwQuRhAjw&RTCj6ng!^cH@Xe}O3)*&e8F<&=DmAZNdpc$ipq;>Ux+QV1)9sWkc6&ZFo;@<8%SbvR zYD<%Kr-43fua%md_CDScroUIly-@;fEMjNT;ZP-QE{xL9%o}^yt&+eWqxAHwG)BUG zQ*SdG>6S8N_y0WuEy@Pv$$CuhcE~=h;%F2m{U&7q0&IE3<$Gm!3!s&1K2Pd&X3Ea1 z_rzY7WS9;b=q1aX`i~y*?7=qJc}+2AZoQ8Jn&JB{^K}Cz>>I&+^k?2MYRH*B;l&OE z;kB6Aux;}7z!!H`deh0DNa@W~-tV^LG}(jy{c!$q9zRf=I@W^th|4gZ9+vql1TTYM z3TbSjhQ5cK@ON)=Pf>?Ys}*St-_gp=Y_OKKR{QV0vG>_sWYA-X{(m6vL`y^1=9%$=-)RPEu+$>M{A_&vejqW||{VOJr!=w$ase`$T*kWof} z>LuOf{Uw6gJHvM{wpl!}3xLhgk@=kjTkAV84h;g(o-IIC;M3H?lD}o~4XK@xi{8?o zkSY6dr%a=r!$ex?3c0D!`*@QYx3O)S!`)_>`QLDSkaXyF!o4Q=bp-L?=0wo~3#E}M znzW2Wj(!bH*;FcOIKdwD{Wy&pHjK)@{-~_TGh#>i9|Kb)7g0*ga!Oj1dO8AEp|k-a z7B08j!y(zN`PA|l(U za-+@;0MsOMwma$Af!tb*`tEdNh4b(ycjSECb?YlCnVb`SRdS=IH??g-#{5_PqY6r_ z$xZ0J0rKxxLvv^z_)%|!w}exeBLBN;6Q2nx)FcXBD?FXqb>x3YMBCYd}M{Jg47u6h+8d#T)d((X{xdf&*pda zJel}^E^-c(KQmMuF_e1iVnIqb`avg*s7=@e07LBpV5lid8ZXf1`l*U_&qxr3t@Ge9 zS5q-Krjf_lPN$!{oM`tNHa#&-30LxQ!q=jqT< zviLm}dwP0mjFrGIP(y*LQSaB`Oves-Sy@?%sjA}K0Nr<-Hsf)g1q+u6oBj!PSdat^tn_Qb?m-y78BlPun&qToj{Q%iTp^s-@dAbd zvqOMUD86MigjVZdlR5yCXR!y*o%_ELek{rmc3 z>OsFfYUL_pQCrg3La63b>W3;cnm2kAh_}a**oC?KeTQF zduFYQEI>s=BnqGPC{^w6l5J%&fI?F1c(}9O1EdoSc7hO|Z=FvH2%%e1^Sh9YF9GexAxJ!cf{H_3Xl2iPLoGu*L(Kk4_989iezJU*% zLgi4JAQ%P-6ezfCFHQj&pzU?mkgBt?8Y7g@1mg{Sa;83J1f;|c006d9AzoJS#DZ!A zTgq6OnwsW$r+w)d?g>Lzi=g@2+NIlI=Phgfccy@bL%*gwEOhp2*E(}d2GlSIAo<2U zy!Lyo39-91*hCEk3$3L1wP{f4f&r@2@FmE|u><1Ni+KiM&h~!C-N_%e{KACaY{ov< z)ATPV(Q;WIFck8qL(#jDHEd%N^ 
z$(r3CavlNW2uM+&){g}O3NBp~qbW2jIyHSzZleyrtKK|-5jg~Ic74J9>aXW4z!mkX zxziPmVD+*=S4so~50?A01iV$(ZqaS0^1`xX*7*RWNil%y>lT>0(pcsr%ut zV6`l8xTHolp+=Z9Etov%hBHKmAW-N25a66k@Vf{ksW0cyvPaW64sa;*moK35@`p== zbfA9d)+V@dG7ok3agfw^EJ@>+G6k>mMyT=IugV;B_KcckYzn>KC5siaLyFQ*VnnT8 zn#PvmF>1<70j#Qwv17pfP$)hNJbKcMSs8(}`(~0?c_8AuP$5LnAZvX`U25?4%_pwI z9N&pL80Q8SbuoP-2QLa&iSqXDL!ka!+yMg#@#gbASfw0Pbyn8IU=tW z$X&c@X6L<^EN{=YU3SI@&!w|SFIUxgAq8fE5N$EB9_mGa>~DvJ+qA-MzjTpsxhH$V zIaJtqsib6(77iQ^Hj(}~Hv`NQ$;z;k1aZG-@18b>g76>+aT8?0Oj{mc*vY^4(oQ5S zf_Z$8UZWTMxF5vse@L{~aMXr-s{Q~x3NDSS`pUf$V7Xda`B4VTlRKTPiEg@o%`*a2 z4d~T?I7|Rkrjv;3dtM=PAFu*{se(4`-yA+=`^O^HV8_6MJ#IPk0DCF)AgI*3-Jr6h z;#L)Qq#&5JebbU%`>jm>N|vL0oX7He(E3O3fv~N0-il=DizCL0Cb1=;wPYXtNHk*q zyPp%Un-%w}U#(814eyz;K668GLS&HfhYueTxE&4khD1qc@q>G@3os7y^7F@US^%LH zQ}>J_T6F-l`ofU2giyamnM?rFne&&oHsC|97fVkhURrxiqi6Cs=dGhQYC#WxLB)io zCLaDymMlSE%{9YSU=@En!n=)rLvNY`U883o>01ybi0OW!UreuYp?`@cKGSy{=iF&M z#~ef)e1@>1%<^V~RM3YYY#|+Mv}1m?o;ZLO$Ori$-poK3Ym55^1~RhOtq%c7FCGEb z^C?#>Dr|P7I;Aa!j32>(nAwNJGWC*#+rCUze||6a{MZnaHN}~S+#@1?ye;t`vIJi* z*7yy9b{kiRjQyHoI3@|-Y!WBn&KDaUbID`K1O)!1N`T{my}eMkqN1-V56}#g0bG$n zCz``CLC?irajwnV?s2L+UpgWtWJAQ*KV9cpofD?GQTK*7VAe`%WCaWe6XD+xFvISHsilD&oly-yT zs)H1I*USrN(#!RDh?*#mNE0F$)|H5Pot)DLz=lfz2C27yGwGcV$uw-0LF0GY!KWMn z?*f3adIrc2rNIQ)6c3m7_F5vYyj&~)+QPHJh$f50>%2UtPI=v!I3!N@@m*`)8}DRZ zD!Y4-C}9p@3O>lNC2du9KGXziPuvqcJQqcvbp+(O#u=FMdt=Fqoe+BEFr#QNXnUC( z!KP>!B?E3lGMQ`{TJIKIl;O;#pw-Aa%^yafv?O$1RGuC$`Y75=tVV3SVUb_)Ka>t%^5RRzs#eM@u<6}YoJobrGMi5|Ikr#Ne z3pDL7)t9cY_Vx;b3)Z_yEbCO%L{UtU?}FL-9J`@-3&@rbFF-MeNpdFL*^k>AwV%k4w|w{4Be zTt}qKWPv&RvsmVCod^M!Ijejrje!gcEnTnIpn9{si`=Bk8Kt<#F~lBe7En3s1y>fiA6~c_TwEDWrQWp~qeoN%DVt}r!0f$c zz_S~QE~nm#o-d^#DAW%Cub1ACr+Flm)v=*pfj@_84NHu5|IX?&yS~?#Yov#QwaXve zhsqcfXQ^1Mb@vW$o^ZPS-57t1_cY=^q365_DyTE&P(iM&Pe#?sIqspbwMLfmiWyA|R{v z;p@?W!K%Yx3`r4Is6UUvz|`urP-;r7&`2mMuDtVKT>R+b+hB#bIzPQdXwuGy;Bd%4@E>y_HnZ1(36H-CEe>HeaZ(#4H)4|0JgO61#_WAK^G1U}>hqT4yV=b1I@C5XL*z 
zKnuLjxrW{uKh}RYbkB|K38B1rU7?Dt@FXJFh)dh*ddCf{Ii!ABU|%*kS4VVEB>*8+ zzDRp8SqRfc_YvcJL?I9OC3-j*iw655YTPXwe5yxq7eW^K&N7-_yXwbn@Wz(n_?qs^ zsb^n5czf|;ZvWEyIzMFqKUEhU{LQy~0th*#Z73VbU*A3rlJ=X)U_7X23pYG5%kNKq zrirz=kUqyC3mL`3IGjfB2Uu!38wAGftz?G);M}5#;xE?Ka|;|zK-lIZ7?t=`+%2Q= z($it|U!P(7?cMWE2GoFrXQ*D{BCuCYIZZ+m!Fv3pA=di1Ne6PPiq{C0`d;q8tbMje zlE;Q)>)utD|Ao{7YDk0a)Y_3~+RhwL{)q4~|r*;oD8 z-JwdifcQ6dmF$qAw~SxlNLB9$iaXf=@=MGtZr8M z6`ht9sk?tujC)gYw;ZM+&ZNNkx7#Q}5sHp3zvjtlU&)9ZFO>2Rq^HVcklT}xpg~kp zG2158t$s&zNYlh>c1dU^H+;pU?!{5MJ|;2Sml9TxxMQVwlcmYRdm-meL7$7yt$a?( zK||#!OZ95YTR#8zf#o~f6~xmS#;Gm9ez-s3J@j$hlyae=y_dlCM9bUh9!8wRqPsj< z3aTj%j0lRN5Wbme%Iwezq*Lq`#gmR;Iq^~HMuPRWt=*Vx^bJ;#anjurXrWQ#<^`4d zmh8+ZX~5&~NvSG!z=ld}AmI;o&sk2qL}6KC;SSGP6uXg@f1#Jn*h+?zcPq^5ygK6t zyu(nKB1Rv5@%k_dSiwYB%L^=m-QABcsPh7MMRD|*Ffpw=?4sd8msA)CY->(~kM94= z0!VpQq2U#U*xeHN_>kUk{&fybws@zXdT-2|5Z;+R{ZsbCQypTYZ!0-DQ)$SEF?Q%o zVYv6P4(GYAdeW~p6L$M4(NVlBBkP$TztkC1jlRZ0$Rjm*t7`hsb+_Ndzje=}rr{Ht zw-cn71Cby}2mVa1_X_81&(A)&PxxqqJEc9a)X;4X5T zIw!}{sr=xy*g$Lewo;JBHHBoJ>B>re9i{oFKYfXzQ0E)wHxz@4Iq?du8|E9wzFQIX zP!7j4)7FayHQwBB!i=jE3uYgpxw2GtSKkslX$Fr{t#KpX<;I4qVk7(^3j(pU#hpIe z1LOv{4G9Riu|{Q96=Afx-kxxgvKc;JSa=JjgAX|JT@$Y=>)S&)pCz)dy+4TYUw*ZY*I)}?2nwG*83>~|lGZJ{>=7()O%LXH z69U+9`%&RFA5ttw9bw*_o?|;85n2zKI0$oCJ8Jnm+Md@4Xh0n7W z5B*F!)MzZ$zxU3Kg)#Qt^E;G3;qz{1hrKM}hhH&p5_FQXsqgR>jW}zd{DVtYmi!Sv zG=(o2Z76N`om}@XxyRz8=AR)K{?6%TD>=}p0|i6qOr{6Ju4~GWTkm#Zp-Vd58w#s( zA61SIbT>Ozh)$RD#SLCIXpZQd7`R{A&?mu9fj{`dEc;3_%b#pzpc%60IBlP0opV-{ z^2!+jE`Ise`3N6tNqy|3PWZ5q;%n9`&v|xiTO@9CFKy)G?O6 zknG}TXLtZ1MhNPi{a;tHG#5_5Ah$&MmZD@4E=mGy_A0i|_)*Q1o&~^~-gY?&@o#?U zvHgU3Z8~ufUbM-L%<3y(J~;Felm5qnAY8NBFtBpM7PD>rG_&iuNd%8#@1@B^7B3Ad zh!dp)Wi!Y{9orsMu|lngIKf^8ZDZ+i?cY<_$fU}iW375`KdGbTn3l`;w zPf!j)o}*co``04reX^CbI04@n7}k(?Y_PqJvEOkJylx0zeuIecQ|LEC#y3((o#EZ? 
zK1C*>E86xOm{TggXpHg1)m~&ME&EZMl-uf(rNFpot3kU4cy8ET&x+Jw_x__MKC`Ex z(1PaRC0}eNumR2kZbJm|r_!^K;mD?s?4Gx;&lmH>t%r$T(nrn*i~;3$kF57AIXiq3_cZ3uFX zpLLr>vOCz6QBmcj9vyy?U14}pIBG%t4G?BnO?b`)0qU^KY4;a{hKp6_NhO;=K|}VT zZJoU>5VP6B>#R!7@yusgQO)Flc?>6$JdKJf`{US}zc?jODE&cN_II~Qs9KU-{abeU zK?54&uP;~JfFYJ=7I%N?iGnicHO8-UIek?w)Ek+AdNJ0O&SR%>THR^*|!QL^RJ_Q{|SJn^og4o_Q8u7uN#b_uF?*_ay!+aYjh;!6oFQvl-Y%d<$z1}WyaTG!-vS2ljT{P;2t76*6r3tg zk?|7$;iZjrA4ZK-PS~~W#DBcx1zMF3QaxxkT+&_{j_P(d-0*zVK0i5-qPm6aPR06Z zd@m%pn0WBqHpcuh*Y%-*DQ7dyQxTOX$qpfBtXx;5l9e*yZSk8igT^iMwHU;7$>W!w z_3Kb2HIfWV4*%W{p%;)T`})kI)`~$&(ZkEdaO&%TO#BDy_(#GghMBBi{L)1R@y@v+h1xAS8ngJ`)K+cCU1`k=bp)4Mh7~g|FYBbU zujS(TUS`oRc_Phtiko8Lwi#hl*e4%u=AXQpJC*@nr_(Cri_VcFHFaToHS)F6(Q^9c z%(R3b(zNih&F6V3ehz}H%#-jm-VD)i##G8#4Nq?e_4Avo8@4T{aF1Sa)1hUeO7X!v zmGzbFw2fOGWx*RqNZb;|-!nsk6l^*>S^u>EeJqR1qA%SNNEta{w{_|&=`9a)MdPj@ z{EA8s%g!aK3-cB9>Y^3R)jX3jX-V{Q&3gZdE#tvNqiTz4m%Egl#FB|RlYsMgWqbWV z&j!x?+S0oA@?EX+qzs1rpA8Idjq?0tX=2lk?5 z5`wTgvZ<*ZCgYP$5w=VW5&55b9l<_tg)pLPn;)bHV@$QrBG-BCj1mT8jG0{c<+|01 zNI2PlaP*EA=PUB0xg9(H{Jb6FO%vOEK6H0;j7L_*Fq1xC%4V|CrhBcR%NLi)z>fCq zW8*8aZ=i4!&>tfRtHM^(D^|iw>O~YkBn;hW0BDo#7rq3ljctBs1C!gQ)qIsJ1zj1U z&maWgh=tFp_0C~K><}S0VV3hS{s(V&>6ZeLW_DdpDBSKxybGiG!ezV*oRmu@FY#5s zj$(h>x!4Dnem@yj%F1)y6?ewRZ)yTV0+LQ*($}B(&mAwOpQjHre5V$tyNF7jgyN1H z$q*djjZXJGr$rZuB2&*;mK+!pX0ug3)nZcJ{73;}yt$AgzwQUG+j56}DQv$%AMK~P zhq<&#SwthW4eBM!Lr(pEQ950^WWAqe_dFi;WsRuZ%kkWRE|oMDSkKEDOe2|=g|`}o z*BCcAx>78xJ(qeqtOSNnA1R1ru$vS)ap^+Co7XNH5SPu#~#8Z%^f#bAS? 
z+zd|r@j3sWqD$|BpX?N+`1mhLYr-I$xg?tTQYmUGplPcT_YX5_YUrRb+1z!X`I9$2 zW{8G!35Dj{#Xi+-<8oWs9|~S?z9@Xp)RcL}EEUfB&d4~%tzov^7$?lER=8|B=^{61?8wN(qAz=6zK;+@0|jFM;~bdSjOmn(j?_Q~XEC|d&tFl+Ng zrCIy1d*-Ch+6#UaWrJjsI++bEsp(bHZ*PZMelWx@{N8tT9y(s*s&T2`yBbtNnqhJ| z>2~T2p_%rg4x`S4@^LlM4koo`d<=Ye$;3RMDU4*;IWZu8DGYaBGy5l73!838FVumY z#`nI!JvX$KG;BslIsCqznf`Q9LOZs<%~6yy@=MZ}6C%gU&C1gU0Ae6z2s)z@j=vi= z*>E}ZLlks~x4g^WN-vAGB#J?UKpoM6Jirq9)8p*uL=2r&VTCQz zV#ycpJc^lDiV#U=F+$`KiA&&3DFjcTEPEkNtq4?1EGqQcgUfwEh1)_uJoVdE_K_zx z)6(1Pj_9?WyJ}U`b}K2GIn@OEyS!Mv^3BK!`Zqt822A@undh6IFSwzmNobpej9W@H zb_Fa&Vz~Bjz(`v=;n`eHKf!!gdq=*iK9P_FDYKgMIS81p-yWM zl{<~96XD~1YexyJOMZnW`}f+or%9Zw+~>ZeF;N_|u9k!M^7BzdOp_ z*DAvin`+IdW*6BDCKhn`HBz8F zHt~2tDllrZ<;&P25v5NPjR2iUT8gb^8m!4R`KR zuy_34S)hh@atXOa&`zBiraO<(;v3p9<&o72JpPe4AD*MmxO~XzbN%wjRvImO-$k$~ z>Y3%ycdASlC`bBg5jAB)D|PZx-U|t>_92lvi4m;?SmA)$?7md+W(@Xh(P_j@gc9m! zvN_?6FIkQ`B&%Prtpg5+1Z^HZkI!TgoRsQfF%t54V}U1S?bdG=eclm|c(rL>NA}7& z=lFA&fq=@BN3aYOp99JEEarUQ95;e}8uE?h6;K)|8OmmzZ-@ z?D(#7s6TQ?4S${WCYLF0)vb-7E5q)hC&St0tc&!4pU49-i=>8o`82J|2O4AZsT^ow zSIL|t0s%b^jR8O62+Py5t_>;up1sBLaee!|HlI4|ZAJb=QQs3&@6|1iG4m0>P5$Ra zepllj?Gv9WD0dVUH!Sa(xFwNv1ME1GRQIpeQuxO%+)Q3x#oo6|VubLYa{6L4cLC)WM(y5kdeteK1 z1bS+_*89eWAU*eI<8l#}h*nq|T8J)2euZ`j*AJhph+S5%GoP&WUxFoYKonnr^&2I`Ju-M)kDcbE60z327Vm%EsBn$Jzxm z%)QIqfTQ-e(NlgJxiDe6MI4dUSVBW-=Vn-4HbZiNc&eg>frPB37?{ioVWRMlAFfQKOQl%9hO3DI>k=(~xg}X{Ka}QNt z>jn31blyuvC<9hbFSaTtx3gl+Rk)9KB7lN~*U{>dq|6%r4K!&v;|B-lSFv$_4S~ zPAzuMNjQ%BB2T2U@UAIze{I^;H@S>g8~YrHAGKkVLhc?FjWC-2{4tkQAevskIn2|j zgw!rh&HJm`DF#7;0C#iwy?T)bgW=r1AOU>8m_j^-LF<;;E{;dt+pB^G@da4br30P z-P>y?!14?$5GY%HIlpfCCmt+|urDebJ^x&pz)$|p8fH>b={q*4lG(s~7rC?U)Rndz zpoEQ?B~9w3V@{bI;wdgw6bFQ5tj>b&*$`lz>sJXpa6rhPd+{XrYF96~-%w{y7vjD1 zH7^+hDGHmb7#BkDH9ZiAY&6fUviaa5d*)#3tW@WGe=J9(W^Sw~y3cj*NzQg<^>0gx@yt>HdP|_y=ke zV*6hQos5jV-GpBhO_dg`ZMqus-4zG~A(c|;VL{bW3#N3l95g5==j9LIOVmn^HOfTu z)!I~_f4@x?t~PY2zwOu6wAiVsoO0>>`C@RQdnb*MxY=l@xQO^LP|&r0ZoValkR@3z zz^k?qr~&aJ%Z)o<*$h1ltCNh;4Se7XV_)AX50}1&Rg`D)WT1DKfwbYjS1k 
z&24Y*zz%`U*;-H}N$%d=2*HCS{+&HV)8~Ae2yEHoI`Y$@uTX-z{XgK;49&T`muA` z`uzdlZo5uVgZsX?3es!R$M1YlI_f_!0l&n5m#Irp9mU2|hpFA5ePA#+4wH*^%~#hY$8ORO4@$_fRGvs=sewNX_1E)|oxgmvX@`5oqFd zJotG67JB`_4<^f0tAwTQ9jS9+=gPN6w7xN1`^f!O2fIhcM1sHQe4hx zD{8bqSUDOn@`Tr&oCIIh{`zSAU)}zh57aO}2O!E%|1{NAlsmwNu(Q zO^~H3{ImXk$twgs%1aPAdj+TBT=-ydGV!(_`_MyqH*S=5W2d6GLuN0sg2Z08+&=5Z z{}-3%i=oEXXL2+55WZj_gfJk(qH6i;rIgUMS!TJHG&24D6|f~5@lkrO*F?`A!^a^- zC0n^OC#4nl7d00~jeVCTRkB=I?Vq6r7uBl*ruKheLPXi64|loa#2od4CL<^No+Al; z&q(`hb>g5W`g@)JCLSRah>?sEp1peJ@UQF@K>{Y~`U&Sck0ToU13>;JuYMi>izfWd z6l&mSKZ%P&`}8|o=rxnmE;iA7zn9GqGNn`ygDhWJs{DI+8WIp>;uTZ4Ui>vY%+CsY zdTFqQl$g-cQAiR){{Me30{gjsIM5U`mFtAxVT%MIAh#rc_Ivk*lbUc?(-*1nrgrd} zH9t;#55M;kwouV)DPB|_L*TPJ-2BwqZ| z7}dZZY4gg`_&1q5fgyB@>_vrtC6|_?utz0T*M^O_@#0RgcclyRO9fGaUb_ulzm5mz zR%1A%f*d+4CDsOe_(7f+!L77m3WeBx7&VL5*^^9OUks+w;rALG6LcsRbV}3id1drW zX|(Gt`loOR{1dTPx>@OQp0)AO7G#m3{UcdI~TY7qW`OA}?_s^!wf{!XHxTqA@ZT`KV{dwOC1-k(zzk?J~ ziue+s6vp03q3KszyfF6h^K16oYezfS$O`NSy{S0hJ>{pJ|4m%~xmIdd_-Zq1WzT?& zDVyyrwaap`=vU>LH3HB?qtt-E@#1GO$_gA(WJ%3SW8?)F`d>F9-h<)PzZ}hLg@GvF z_F#zk1Mv?{|Mr1Ats;%M?#BIf6ga_fU*_WLL_oW{eT6aeLW8yiiCwF{_r?$bB8?W1rJs z9PbsA52)7QxDuPXjX!{&8at*se|| z}Pda&sH$wMV?KuA5z$Tj-evP>T5p$9O=^BAiEU3_wDuRd;Mw~ z4FJpbCEd@GmX%G_V}_c3AI^B@=sK<}nBtPiY5cTc6A)G7Lmw}HHL1p10R)P1XE~fI zac8z(5kQIbtgMgk5Hg#EL1|XMRohw*CUcK9d%N~!iGBJO2>L*;gL3hTG-YstG7O89 zR=ZfIc1y3?Mj7tX?p*)h$By&{;WGkcMcq8Z1q*R70$E*R2Y}qc#4BlpoJL;_-+@2&t31AUi>ztz~qpW}??0W_k@_tXJgLX5YBIE+HnPcPtVU#>cuy1EQ?6pl^t zNZ7j1gGST#W7J)4zvlD>08Xew#9=Uj&I|5?$5}UDInAbEV>n}WEo8*y=E~c9?Ty?Z zQ@)byc|~$E30>XuxdwHQof+#fxWM$^xtg%YfIo-%Sm~9MOHd04=ujK9QTvfUaFK=* z-nHlj?GK}ag1O)9eOG%3;F*0S3Ch7Tc{_0qWTd6#JHxPt7!?!QxwD`a zaGWPG!%>*85cI6}IGCw@HpC;MqILq-ku_+nZgXK-(y&uE3kk-MPBsm^o{y?A}NHRVn@rN>l8AN<;Y+@%;GeQz;1qQrZQ>; zNFqk;6-?<2FqEmgwNEGGfc9;|m5VE)pRba^3VMlG9a>PWevT2O#lYA0lb9`QSh*9A z>zFK?@_&a^1wNz!p?aTUnX+@jG;73jwGo^nDfW~|1s&4O4|>QP^Poy1_SIhd7V(Sv zuN5j5bV>~v3RJTMDrKo_+E!Lpo>y9qWJ}bUS?CxzQu!af(I*%2s^Cb5WA6dy>%`v2 
zle$B~UFNW&a4cz(<3et>|2sw!;vfM9N=~oCU4sFVPe%Agv?rXHJd~2T+9CQcJi{M4 zz5)phrW%P&E(wKq1OSX#XS)OXum(bC(ZeQ0_PYpDC;ez|`qLk{4)TP?g7y&eT{KZb-_U-V(x#-255?eO0w5_)XDD{RWn79?nC*} z>9~}3ww{7s)0KF9P$ra0IWysIComK89vGt$i+=sDyXC#{mobOYENpM67u@R6Ul!YE^q@6Ce8%5 z<&hZZ7V#KFE88jZ?vsj03iOi?CS>yA2#AW0AM<2gpyI}YX6>K8vo)(N=_7SU3vxl? z-S4d=AMMOepWJi)W%`2LHq`;s87<6Q80zprr<4_HH^wtz3#fA!uxqB3uukRGQcS~- zX_m#!=%gxvIDf0fkDgxSK4+^fI^h2Q6;>rAB7}HdjZdxRrgn@d2D1#8!V# z3nmfT9!o`%e=Q@Dus)EKwwvrFfd)WBX9()KAf-YBP-46&m3Qz|Mx|fiqBHCXYyz5hAFAKtQ!4@* zXy+?s@S~a5hys%>5uf34^6_vzFaW$k4`Ssp07ROfyb=aYfMnp?7t-esf`gz9{d;g) zyX@3X-z&dc1MlP<%mZ4ha@tlJ{ckcnEQlUFH)>o=wzmZ6)ClYrq2kg`*xQjoACgQ1 z8uxKhxGlq8AyA3b|G*?y);J~jkZX?;spk1Ue8s=?q z&QU`Z;D^~~p(5j^x(w46RjuchxX z^ty!G)yUV{ZvcD$WF@8M>14k;E%85Yz+dJG7sgsurpGN=vbJTZe48iIy>Tf;^saHg zxdvyK&lH{{c$VFfLfFC9E-GANC{Li~-1#8Ar?!1`bTl}r zxP4%?YJiis@8!#v2g46{lh*;v-C)DNh}Wha((rL;Ns9Ici@Gh3u_J+9Ut=!^?lt`T zCC-Jvw3lus^0cIS8sZd^^|t&-OD&`=Ux2Vje|W7geupKHpQ3tc4(uI0X~ z#zbRT`rhpGJ8XH&RnGr&ehPq{3?JWpl>mIf$Fs6B?_Cc` zSfE70UsR0BF~42I&HlVwC3BVEo*^t6ADHe|r?={|vmHA!%&B|q)G%t=3d_$paLan1 zM}qy0&+$(MB_$HY|zq(ZZQG;)vSRqjg#KkOux5$ zlDn_PhLQj6KeovLcbT(l2_FN(4qFVkRNi~6q=fZVxXPkk5rJEYi#~i`G5D0^BKJEy zRohAQcMAC5{!bi6sLEGAN_Bj6Jfvinji%k!_`E#&&8gBfaFTu<7EuRf{xJ$^ z;I<1*K+w15CAz&4Dcru&`h};V@xWCX$dQRsqI)`3!g7{LifYcty8}F{u}^`4o8tvB z;)FRUIEYGY(fTe==(GQ+kns1?i%{cR))z-Yls4;%z~gaQlee2HNl{7{Qt#;OwBBFo z&JP<(6Fl`*o-Q}N&tX`X5J|*l^OQ>yT@)~-vHZ`bdVv-E^VFk4d=W#*)GxNv#qTq5 zoS{YXFz{xjs_dT={6D&VWNFJ{0gJF)aj%rN*7NUVa;(JFX>F!#udO_Z8g3Ejxf~ySi9!%z zFTwx&Ys3YBlNg6R(x`~<#Nf2jg+U7JGX<~})g&;hcd%zYaFtEuFtkb3=4-F9mOl;m zUhR%spsvY0fFCk08^avDw!=qqlloy<=wM}zsWMb1tI@U{(k8rSoR$3hX~g2ebwkOR z*|?xAbdkeB-YVNKl#QSgq;z?T=| zM8Rbil)$E|a`1hK4Q#hApij#H7@i#wy=0KCXar18#_y5erJ%D?;W*#U)pkL$)c{Bp zv<5J{hcIzfQrit(`WqJC7;&>-85=TYl*Ch!e>?BgTNq9!=_R(qpxntFqzJYH&}W)d zVr8d(QKeGlkpy_)i~u+1>v&wG@g9U*=nep%-3!uD%%yR>^2LqXvG^mX#_Amx3D=hf zD$G0Az_0l%2H;*eld8!PSH!DVM9tL0Ah}}ZIi|$#v|Ny8NQV1{$GSiHn$92OW>r0% 
zWM$DRj8)1I*8b=L>S|+PV)E{*t4+bPGR@k98I2dYj-CK#2JVf;tB#+i_(N2fxVaop zMsepJ9X6xSS$3xWb)Jap;J1b1{8+tws+q6ET}e>he9+IHfkJqyXISq5dPNndUecLv z#q9jBi^KDoON3mBFTqnqc>CK1^1Sv}SRQMQT7j&C?-tc&?MUrBp=js~`}_L`ApIxR z{+__|C!&?ZlhhS{;53oF3KB{FCIR}|CgC}5sn$JVg?i|=Jw$4Ye0cQ2%T1O%6olug0??_;QiUajr zN=>BsMZK^j#OoP6DfC%cqwEswmmT`2dh2|_+v+c(Y||#brPK~qvafO!WHDTP@p};v zjr^955t~3x)?mS*a!8wn5RP=@0B6%+N!aHwR2}U%c@iP0HW>k)iHE&(F(l^SfhJ2d z*ny0THx#bNlH5;nKjK~?K|Wg8hq)$CN8Y&tc*3a=Ixnl=TL0UDyi(()DZiZOSc@=; z5$EYlWHg-QW0I%R|6??x8zD6iKM1&L7|aq|6e&5b%lEmjS**k1db_-HLfTi3o5jy* zCELHUmmWQ`T1Ou%frLrS$+!QA+RF@;tbj&{eXidu!aLZDT1B&*(5TnUeQS!}gnIAp z1D~Tp4zspgD}cu06A;jW2OMb}Y9i0IUNQcCD2)judkRRo!;N`gB8wa>Ql2iaytY|s z6!HkJYTGY~@B|Vp8emOkwY=I2?%`oEZEfCS4Pb(4VT7&^GXR2&4+=>jX)$Tip z_j2{Cu@4S4bFNY--x_yon$ON<6^|5aNy_N|)_viE3ln0Ogoyjkgr)YP;q8=RWo3bu z#l3K+<)MJnomr+mmv*TIU}Jb!7tJ;kD$iM%(_&a*BOZ^5L$!4~p`>2^q2b{J*QblS zA7>(n>lm$vgNW#$9|u>|vW`h7mmIS9fSo zjzK1L%>HZj(eLeORD3yD7HLbt7x1ic;GqwSW;pH{wO3^K_R7;O2~9hvK*ngfw#$Bw z6}aj@OvBK6`jXMp#L&cGlI>}WG~?h5-Lv|vszaxL3unLzB}wJA=t9rJR8u9G4zCd< zP9DZ~{}geIj(Zt>FMPu)fx4avh6N2my3^@IrmLxr9DIjUuM9N?N+O~(VYHgs4D9Ki zc$DguIRNpb4t3GPHqi4CJhV;m;vuN2Nnkk`1)MFT(Io?)MB-QLr2}2v(a*_&wRXc8 z%13L!Qs*YlQ%)0502%OB!0|lsF&2+CKxYC?ssuii=L6PqOlm1?Kr!X{XO)aM>hX00 zyj119i?@0ndhlvaaOL-;Wq*?;()%B^lA0Og44;^ug@t9_UdBX({&G!UOd6b8X7kB$zI3>Q4cXV5ILxY48DL9b$ zo=(VD&tw5-p|%gR9m44cG?1$4&5mSl%fXx;i&3|LIB^3=Bq4f|-RVJaNHr<@SXM5t_F36K5fM;^r~+6W zq<;>PVWqD@B*Do+h?z?&WWsKROA;*2pSVR(a3Ah35iP9#I8;p&7)mj=8BVu504da+ z`B{ranoJVp-AE{RlCw+qc;vj%*M2^={gO#^v@?1dKK)Q+0k0a0t5fOt6=Yb`b8-%E zQ@umyVv)%Zc2_c~LF6W;DQPh9nT^_^o-#X8_xhTvG(olF53=NxD$(hda98)p1J$fZ z>6hpb2L+I&mb7#a^I|i_0!=6bubc_K5)li6CxRN%Ept>y6jkuUp(w~SJIz;VaRR7v{gu2@kpDW_f%)ha$hX8|oNqGcavbO6Z~# zg2(w{Ig^a9O4nbZ!>dO<9{4D;);3Ci{`6@{%cHwFw5|jGf?8F>hHqX@jbvq@#bxDS zZL6#29EA1N$e@)j_I(=bVa+vYyyEdYLRAoPHKFZrRam|EqSPh#y~*2Pm#%}*Z_m9u z;KcI-zZIoO|2@pVjT(@vL>pv&SU=|yMe2f!yAMA)^;zGv`klMLZTv@95IfXhcm^Vzzg6SL3P2qNt?UT|@eTBnT*5fU~|$4EORMdEy~d9=Lgm+(2L 
zK%mTXki@FpAHe|J`Icso@XTmTHMRm{v&i)Wez@#NP$4YkA;|kRUafqc;yux9cYnrh zQd>90!@?J<6UH18KE|Y)$tN>5F5Pz!4*^r_b>%o!odAjn?cQF{10s#Jkg@3VVdwVI zs5pm$OhzCB#W}W~%W`GxVcu`fF5Xc+c2d7jP|#nDXz~tqRdSop=*kHevK;)tc9^`n z#IeDvcr*Ip84fOySGM>X?6C?R>NnBTJG z)J>#I(hiBY%QlYaj0mqYaO4{CrJD~g&pP7Ue@k9ZhbRVRHWog&fl8UC=M$5HjbnPa zAzi{voSeXji+c^VdZhv9EsPqmQZoGO)#;~Oeol|!s>%{>jn=CJ#ayBL#vdn)=-(`q zn~L$4pb=Z33qXaJJkStP?l)YtJ2`h$kT+aPJb^)x`R`5DKGPkoqqzn-xx9#0i#}T% z)tQd;u_j2SH+71vhMA1hbu4xQ`gQvaCG#E{I0l(CncLh01n>Phbt$tf5GIO9Tt|gs%dC*m9yq z96|b{U<-K=)GYW)$=Z;m7IKCZ(!k|^;iiu3cC=&8Dk{{%-+Fyw2=0G@<^T4B<| z13Cv9%8ws5PtIP4H$Dl!7j~8Xnsv@=*2HUK_gnR6ukp@R6sgDwCsSmp`6cdE+}tNC zxRLB35I_2nJOzU4_c%$T7JGEzkx@Y()Lv;n=+XKVVIG{~v=s1t_~}HUdBQrU@kjYD znUChJxvTfv{Bai8_YB&($cZ$T+JhUvf>7GC1Up+bUFp^y6!I`3ro99hS+>e6ta=IN zL3HururtO6ak1;8@gyvQF_9~JL{*6FO@;AeqE|IC@s9c3{pE+aUi}VZv;0bTm7wG& znRskF&5Jjt1a^8Y6*X1OnpfruwqFMJ)VJ0H4^ProJ|soO4HJi^{4T^s#fVni-ETa# zmu-7F8A~$YyZ+IGVI7nQyi!N`OHYD}3Bi!PY(j1s>L-3@%i$iF%kR0ZdZBd5OReIM zx{hINlqIjJ#1`%Ro55N?-eF{nKSnJl+*y1m6CWdX^VO<- zkE*~dXLgh6;%4t6XdZBXED(acO=J@+!C>!k6D;lvU_K3b?AdtUVU*XZ?9q3Oe}L== zd^m)dv^a5D47csCqh04g&&W8RKq<_Ug-LQyX=PV~Q5Z;{azoxs5Na|PJK0N7x|h1@ z%As>MEK#3ed^aiwr3E9Os}+LmbZgFJr}2;rLl8nV)RVA@wibcMWYpSU;2uj+Ld-NS z*Wy)`(g~QS_5|L&5!Znh>ak{h8|oio(%8ls$CKh?A&dXwm6l$;^2VyG$H0V(H90%| zp3Wd)EyLDph4+}p;xiW%#97RbLfB{>h`ZdGEPGnsD=(S&imzXyeQXdMy?LZ#_C$0g zy=^a+kH7QL%Y|8y%kAnUD|ep|v-g5KFU*iopA;k=X}`K1w=oH$^wgwot)ZF@_67@ z>;cH?O>R?jS_~nhdIhNh#or@^-(o(V4U_4<_mb|ztD5T8PLR;1Ad5nfK;S974ox#4 z4qG>yS3G5Gs+anB?F3NIv4yK>U4FuLFllX6XhIes1&4 z6^qxvHvPi{ceHW+u^zhchoX#-L=88@mYp|WtEdtn3S(D)N1|^9&@s!7! 
z*a}P4{J1mmXfdVZ!THqy&NrJ?cck{N$#{l`TRr~aqA6YmjTd3XuMVaZt!$GBax9MIOO8X&7XOx>>tl-XMACFNt$+j(!uzN63a0B=SqPA}Q&jVQ8 zO&>1{07XueKs;L;tNVd#&t;K3cNuj2lGclW(6Hf$IXPXAm`#C%M9&C07P6!?pfao& z5W9HPL)3CCFvePM{bQO1ozu$d#OONW!+XAAh_XMg6w#abr&wNcivMc^P3XVSk~RXw z414@c3OnzxVsV)LocAlUAEJXSL50jKgc|AYqszQ1j@YG|Ac#YnzxtDAcg|#Mt<0}f z6?i<`=GO)uJ~L4xNL;pRhJN8pSd8sK649d-XpA%=j-B86*w@`O4xnikFAnA36()SabL+{BYSc7!>EUQ6>{5~X1}~*I z&i*B-;6lf;K*$V0xcJNZmu1x@#CDqx-uu*MNrQCxMwOgErf11CNR3$-%_Z$=)! zI|RiI*1&=jq-59A1yLBkvmt;$VKZRT!b+I-KsAy9xTm4@X^&4q@#XNvJ(rko3T9hV zr8bOo?%xK7KUz(;atuk|F@cKRn`VA)5oI=&T5OOQi#V+v`>>paXsNU2s^=P3aP^gama>oms(F}3e zyUPgQ2qAi(tZWlT5>e9}X$+2@o30A7fqkp$TI$?_< zlS*}7-PnY(TD?bkmP51j3;^)7bRBE`Hr(IW*}~7d4in`D{#7h6Np$_IE3*U-TYhBu z@plif;{Lo~G6>b*fr4Usak9e(*Drv#KT8xU1ebR&n@~lM=~JocY*I(Pn2XlsOt7Aj ziuyH}O_2@OVT{Vy?)n{#%As_+!V4m`Ln)j0mWo_AhJ#~~xj0N)9%%lsY8RQanUXdnP#cS%YBD<+EN;Djj`E)NZaj*)0@=!}-EPZLW zgZYqlPfS9U&D2)7mD6uF?XI7lrj$k@Uh3DFQJ$~nPT^A~8loEJbd`L<{pQ}Xm3qK) z^!lj_&rL}U370MaW#dVwpvkmT0u}m-xOeg?ISsfZS@kuqK^Ao3xvOeL*gD^q;Phb5 z_&aBA@BLq?8Xb4YeHS#0)uuom7W=++z2AuvohInqQJx@{*iDPs{dX+@1~Cc!3-5;# zBh$K{9pWlAErxtH3__)0FW5fHuY7>T0y7<|rpy z^WLD6^KASaQ9h?n}&6+56MsFsJ*1tf*5i#c(U{1BVtDmR{te+}KdSK`iE7@Bt zUQ9x1d}*2K{Y+1NUUu{6CF8*oe9lvXrRxFHiAhY3@)}JSg{j zrNjy#ZUl;FpAw7%=4oU+bmMn?BUA_$BYM%>#V5hbtyKU73!CVt%mThi{B!x!6`!oT z^H*irSu{n;m$ZQ=L6`h|k?j8M>ACD3#&=$O-x&X~FnS?KM}I?a)HN7FS(MrD+xH-d zux5Bgxy#dCK{Pq!PJxfl0(vP+I&`2q=!^vfUlcO;=3Lq$wOvGhJ@2iXUc}BVQ0R$3 zU3I6mZ5Ufdn%rW<0_UPOXa6 zmslG4c^~Dz!FOm|!W#-?KHi1HoCd;p;z*keBfyR8(`{Dqspd6}AYeAx z9fmrZpYY>SG9K~tiQdfd{-sD*g(3jqFO;$#fYZHiAnBY*2v97CqzByj3a>>TTz`Gi z!?#bTboWpADmE8g=rU4&QgyD%IttXdlK|@L{o}1^T#^O2M(@AQR^F^ZOGk&CZ?d2T z3S%yeSeU@C5M2@p#FAaWPCFxErkKD?1L_!Aj@&q-EYq01{U>xzt%(6Nel|D|a3{A#l+;t8-4L|t+zM6EV^Yz zPW-{Oa18wK*Pgr?D1vBX3~Z}M-}`Y z;BFm+UuvI4vV@^w-?RFkkJ%&81J(4>filJgX#zk8;yCp&9|rv{vFVZ;-~Nkxzt8`W(kwP7eNJ=9oy;_YN9kM4sX}&6pso@~rHvH_MZ!qW_#rWDX}{+g z{IfwKH{jpckx{W?L2)v90A`@%4iIm=QC 
z4m3r;-Jv+|&V3saTzo!NOcEjjqFDTPAeQzP9NK;Jo{Jp76X(4YAB=<7XJdW}HVu4Y zV%sUfCnIepo}i#!<7R%E5+F*BktCyW@Ilt&s)*nr>?Z#)sGzo-cZQ+%7YjnKag zXSgo@8HyA+gg}#Ih61@5O8vWM?q)HO_-_T3zF)cOsdQz$J$V%cm*>{4uJnQA8Zl_> zEd4rD=_Ej6;IAt8f?Gh6m7C#y}dmvs;+Xo z=8m`B$~;pPmC3U!?a(NH3`&V!q?ar@_s^V~f2-gfwhx%VyOg=1)43Mdjf~H_ z`N|fqYE)9}@U_fbeG?>uB(Nsxx@_Qoa}`pzx%Ka@Q9k@pAn4sbzFA;4cn<$J!4t27CM;bnx<0Nah`rD(ONrU+*_R+FMfI0lrouSi=KA{$6jr z{1p#pQ{CKZ`ofD|(*(h#M|762vRe}k&X#mX)Y#6tT%EWF{P2RL&+dbHHwL62O{z7p zytyU&K^zmMdKVI`&W912+a?UBLc|%NZ3GyFC;E19L-OXz1ytXcdGB+66~}ayBei;* zloa+Uq@=`t>l~$vZ?3oZM(JtUlRy!9DY_@g zM8pea_VHItH!HSe@NNa$+P$}<1lW1C475Vn=TzIq`X-l8tNnQW>swe^dFuTREi{Yt zD}yrfF|V86Cecup)OKa&rqY)$m?}CvF`c<7!}J$Udi$raF(aC})JO0*>2?K;Bc9n+#uR!1!xBJyTi&QIG^t|BK{h zp}n78V9Sd>MTZb#HF6pYH;M({mOS!}yOg{={}Px7SdN3s-A`nW2~+`R5vCs6fb@&mA7B1IZx+_Za|v3kXjONIcLtFb=ZW**kL?)NFk7o><^KTKpqDTq$I$ ztOw^eOd6{6E^fsr=H7deXMWtFQ6??wz4h)R&(b6jiwCg{1_x9LKW?~h#-7OF3w_N z4!p#mkSo_DszhTne5ITJv@IaneH+PesF1~J&G1nzG@3-u@T2HQ`4#*x3fPJejdrWg ztzIcdpG9BxT0Ww%vqmvyqhOe><*>$Jx_tHdrJ%Pu9>`>5b!?GH4&IpwoN(!-%GQGy{3;@%mPlom%9$!l24uH zw2*$4Op6C$V!sc}e5$VmMl^#FaS`x;6 zL0-w;mT6tS9EHuCH}HI{z6z@y^+wyS$Fo~2Jtyzi)-Ru>xl?eoTW?S36`64M58ocn z`SVQ7)X#7Z(0$ieG<*5-b;8!nTfPPM$$myp@z{;=EYJSa>eJNiz)}&~Rg)(ciCelH zR?-B&tGQI?VmkgMEZPqbyBn#9A-a9#swM;#TCss%T-q}Wp^y{7OZ!umILbqE+?2=l zFGfG;6tYAbe9PFy<-Ju}ufZWMgX3i%O(iDU9Y`dNh0e%@PR4~!EibhldvUy##(bOG}YND7}_+_%LB=T*qQMtT?;VFeKH z`vAAT4yv5Hl;K2J0}j%eL=WS)S;2usV6&s(Oc&U5&*5-=%~+OCLUEa9nNS&;WH~}s zyyiC6?M3chdPe)S{MGDpOh$RB=W!TQ73s6(oJ{=eh$#EwKXqD?Z#${#XM-^gZlAss z$}TBX0=PBD6ZyA-LEF0rbZ~?w9<2a5ujq!5Tt7q#LSFCP91^7 z%Iv#KlWV=J&z!sHpW!0rAD3@GmTw;(L>oK(9Oe;M6sDx@fwuS$qGT2rL2H zEwj@0KC_%n&}zP{w^NVH+YV{efjU4O@g1ckK@=DrQ__KeK3ygmZw8Y{QHe!Mw#WGo zR~X6xGrnByG@T>RtKUg>jff@nEQ>v4^q2xAq0Qz$VC3|mCYhO&RqoI@KIWhcnrMRJ zhi(lBQr336kv)@6dzXtZqTp0j#}?BUh5XDn!ILWC{##w#H*P1>=f3Md6W_mgf*HB& zDr+BCZY*QL;_tCnXbNZUn8OV6+Afo7x>Ch);0UO z7Gd|F;Plwfn%{na&MSZ61N00D3tiJ3j4r%28Hrh_o{9glwtl0_dq2RTryh1SeHs+P 
zp*UycwfQ3Y$71G*7T->R;Jyq#*O=O-QPS(YBed}SE>692n@465MsWbSJVN8yIpsrkE}NWR09rV4pR+1Z`tQJbQ~Z zRcZNIi$C(riEJz51L{?w)qJr(k@Bz9rshxUbM6hlkw5y_eFzRkMjmTa^ z=;w(0c#s7$u2X(5Sle7qL?)LiV134oOFn4k$>Yt-A?edU4mbAMeE-p1?EGivPKrgK zpBKpXvEE%cdy6-JTp2S$TXH4@*fnE2G%*`-J`AGVC-|(M!K=R=P;@4>p3rK&*(%=s zu}>$d^A3?J%$m!=_2DgHrN;>EH(`aVRio?#&r9PJzW=n>n`ubXZ%0!wFi$hySv)@A z*}PbjjQQL)oj8E(SekIkWIuYKXNTskS>d*yk?%L_`@Pq4_y19K9^h2J|Nk!?93u{r z%@K~hBkI^YGs@muHpw1`VW8Khf zt69CR=fFUjU{s20Rp@<;`c*|VO@)d6IDONH?fU?W^vu)9s$HiR-&}uUaNs?CC-BB2 z_??pVE1RaPTR;nB`W}0>xBRSD{X(lii^$pl)gsT*oWj12y>8K@qwc$7pK|@14#jw} zQ}*$l=DGCt6V==no*o8=@BH@X+74HqlvAq}$LvK8&072z87TkLdg|Wuy4s-Uz|)|2 zXvW@_M#d>0A*f);G5Xz(TMmEubzA3P&+De+o*Pp@OhoEx>A z^!W*X&*e9*nQkS4&O+lG2F}tBF&WXGz=?jpI`yLGR_~{7o44yaqme5ab$8|jEq{(% z1p6_UFgASbK1@zZ0)uU!_4@SdM*Ulz;jmBj@Q$rn1{2{XcEK87a`vZaGF?zSvN&&_ zRF8hcU6k?0V#ho_TA0Tt$-W&|C<33)%XOZT*vn;-_}!6A)=yaJ_@LNvy3YrU@$z@} zEn@~3qbg3U$;4|KtohGfZZ-`_F|!*@a~b$EN--=>T9_|hitjR(gD8SN5f-#HTw@Ky ze8vR8V~M$4u7_*+b;-UHg9?Ko0Jy2im*;B|pF?r+d$NCmD@_jobLS~*Y{-fKwHKmk zQNZ16(38Z?2^~ZQp^hmEw{iApdWmP4^Skk=$8SKCAqK;ASYiIL)B=_f&A(t+s=(q4 z5q$)1#5r6J(mJ$#Y4_9zPf1nhNg^4rXF2yKkHyZjf>>hAV81~@Z}zS8sRu4`N8odr z5uIpa_c}1syJ!Zr{^5tf1Bg6q8x)CTNvXw9nOR;dZ!sagdV|tM0e}$-zy0c+Pp-vkO)(TtC&t1Q(h1U zZk2XEW7y$8y7V@|vv@?j%Kf|^8!Q@1ZpF}zn~yo@x?{pDXk1p4^-cM#UCkr{)pYO)!HrP=-!S9Bf$i1 zl4s*@otX4pmH6>8r$?v{O7BxcS>BS!@X>o!_Zj+0f`$X*Fo#&OFxx-1im7P<_%u5t zuO*z2+P71V-d@^f(2DqUPyuKHy|6L6BlT^aL2|Cd==}$M0v+#Jos)}gwpE_!J;t}p zd?D`0s%Uw(QSQ1-jrACofwNMr_y9J^{v8O`sci z1cn(@ygZ`^CNn=UPviIXv5S3c)=-oFl4&Bs$ZarWB-fbnWGI3-)Fr=)XSj+8DGAdI zabfWK)ST(F!S2z}fottmQnS=aVtvCE*PIwB?+^%J^TWh-Alx@m+ss1Kv~A@2g6-#U+2xl&8|E(_>u(pqLDgxBnFOR*NS}!-fB&3NxRZQn=eSj?FP|`r+v9K3^ zHRM))8yoBNodj2aLow1!o?WEtFJ;xVmLeJT1XD6A-@k$hY2`in%uBC^<^DvYk4e3} z!>cDLvE(itlqA@(%MLBmZNa#Oy4feh_~9`uP^vkPHJZ{J#P6*ri{lJA%C`7L)#C-Y z{2h`dGktyoUD8PSJc+}iX4=UcJ ztS}EH@VtX$t}IuUoc40@jMX061w1h~yD+{Rv9R-0>rKhLQe`QCs{i(_ew&FEkI0?1)Ybp$YLZy&n0&MVQ_|mx 
zem6l<#o>IU4r%~Q9&BXI0+GUc8P%*2I_J%W;3qhKD-8J>*}>ydGdHR44keQNh5oT+oFI$eXjy`ULHr^t zKKwn5z?O5O03sEE>AF{D{yXuKjReXoIQDMbDn&2D{H=Ry?1o#O#{UGdd($Q!0W*B| z=Xfm;V`^QYED&v0^NwJL-8S7$#{{UqYEX{70pr@V0)WMT|KenNOTl@D5iDrZ~yGbypQY!IU#ZUIkCKfbI; zxFPq|P=>4b<`Pf?_F~DDF|S-wmXu8ksjjgWio9bp@P<`imkfHedFPgyF5``Naz|Tu z4B>GqUwxjL<|}aR5isV0Q0K@egL@f0Ad*nzEj|Av7CUIb!X%c?7mahfQ8 z+pHFIf3l?~y&DGkG+qckeG=a}r=9v=YM(qo48)Pfq7D;p&ScLEL6`6`=QaSX%fm2l zwA4c3C&L7|SS>T+EYSXEuYucPsX)^1(ime}S*QqBjp~PtO=PL^NJ6zC#-ItgoHTOP z`iA@PEDR8OAG`^mCNLPIVNuWYawW;6ee~#2YT+TM!L`*%xk#eZ5rAxv2+DMCQ<+nY zbqh$Pm_oln-`IPF+=zZM?7MCYHlxJ2Z@aHb)uY1ayTFJ{AWOr=5-t2Gd%(!^TW@jAScDYEWe9UGxODUJY*XT`qAFPN~m`f9h zIb`A7+UtB-Z{6klcelmYHs-y3jJ?J(C`nYksO3?Fa<1vcuABVLUh*0?0*y}^4+R{q;4*EasE$@O#qZIWz;%sQj< zXxW{^+;wPQhldrva{*JF@0@c=@_8RG{2uwoY_B9k#plw1iJv<&BNPHK`(sK1(PL!Je$VYIz`fVMt5@EwN%L*v{uZ0X^yE8*Rupo}=-@ex6 z8iOwPW*GI{8f5Y)xD{XZ@EC9mf7nIEr^SjRlB5mL1<$z~IScU_45E!P}?{X3CK_+X^=R08yb4 zF+E55gI9aYlUKtIA!^{WY9EY5{UWHOo|18t{P&xI-4xWFx36=tAc0Nzji|eIl#Ca! zZ07=>$394ue9#pZBrxq>2Epndq`m8h3t~*ayHvuq(ymq)d=nEb_t+7$QTt!3X#P$~ zz#pIWq3jS4Mbjv=Q9orG3C1Rdxxx8W!B-1hnDuN6!P4D3(6pmjHTSJEp1q#Tit zXj-IO;F@`>bIe7dSH%GYk|HhZObN&GI{4N#@98zE=WREy|A{ zzi!4pi&y4RDzg2@ga$HmC-36mSZyrXXMO}$78%Z2JW;LIur_iGXtNLI2 zs|!9yZvAhigYPNuy7|FQ{rg}gjCBIY0_s&efbP+-t!_I2GMEXDy~YRCv!E1pNVKd5 zZJ>No_rY}01k1WKVMomth?z3R1W{(_MzQb3 zS@ig?z=>cf0m(q`g-{A+&`#S2@RX-DFMZ2k9CHC@j&@%^Cf)+m+`wQ;UNAB*5d;D4 zOF=fO?gc@6D2B-picelayYURDM#2u0^N)z;sFRJJzq!S(+0O2XNfv^Cli>!V?1-j% zZ_LM*%xfafu3wRN@MBm8Mh`gy$8Zi1P4+P%RM7s*1tG@!9bl2*%R-o|z8r$s*?=8| z|2(bioj&LDlRdY7p$X^)$p7hMU@b6XQ_zYUE8G2u$^d=rwiFG9h5LLa{y(sPL9#7D z1~rwh+Oj+TemE%6H$`_$OkQc>{5;y4(kf6-y{klh&#BM@pMr<$&+o$oK--!8$77RN zAr>F`w*Uj62E3GeTz_M9lGh-8m*uY83fG?Kz5e_BT_Q?%iobzI!tY}T?N+77U%1$5 zbpR}IZReYxYit2X2pVQfho-wDu7A`TL~pfInaLDW#YWe-t@Mbl48=SOm07FL4WNFm6Xh}YEeQBh;)b?GWc%sb6*1b!2Pv)DlpdZmwty1 zLyZe~<7+_CAaXdY05X{AOQ8K{vN`hsqt{g`^ETXd0vChNvHaBho%t5~`IC#T7cS56 z-N67mVCPY6X8sx@m>K!I%PhHf;klv1?HpE#f!J9JgDF 
z3mv&QUkorDcF>8qNcLh<0iu_{c(5T_J{>cDkDkpHp%9TNbNSZ)=*{*L-W}*P2bhyM zTVXev{>y&6FjA17Vqgy}tZgVhOdFhw6ybZSnMl3fEUsz(Og6O)@EAStfR7`=Rou0s zT(($-OUirc5t&V^`C@eS!&$Jxm%zRK;~LmY{lz*&4MR>hZmg-A8-2R*Y8@yx?mcPs z{66%pq!3zrO5ESVwbz~zpdiOk6An=L<*cdL#&OfH5i5M)^Tg~Z@CZoZeDq~G*pPEC zcRr_edY`$x?Ytahb1u%N)Bgc#e$wB^5lkQg2<68*dO6@Ehnz{dyKi=mV4Woj(t7HEiAJ($pz18!s6%JgSlJ7!4rgIN${N@kNPAn|QS zQ?7d~B`%B^)q9n0n>-tq!_45kzgB8d|KwTWWM_>Th+2=$vI1Q>g3f0^oaJjLI7sv4 z;@-BSXC@a1#-ZGwW77r3xh80w>mO}592@)USL;^dYL}YJeOqK_iBXf<1_x-IWE`HM z|KZo*W7u*nNLk2Y#TZFZYK-5tC>d=fTdnP6Qm9X|jLE||Rj2un#@)tHEXjwx6+k8kN4 zW5*2{mC2Ww5Fj>1gK6e2>Nju+jL10icP5q2U@9w62Hf@jRQx0m9s_sfM8gtx+O^3z z)d=a4x^Q&OPY@8FMhq}Ua`z>5eiVfT&L%x(&~2nz6pzzr4^%2&}7<6>v$fHplOI zProtR8Una|yEW>xEdP2H04om9)GF1e?s-0i5~kMIugatZX0%@Zu<3A|?}JHS$H18t{s7RYpXd8YKvtGbwm)opO} z(3#7TkcltzY;Rf2l>j^5#yIoXS8na`Jt60rSYN_!*RhV1D~r;)%C$=qvqb*tPUuCT zXWH2b(CSC_8!z6vPb>u30?;I>{&C3r|D20m*RWtJ^SNf)MM`5dMJ;e2FPbElj^a>6 zZT?u1kpb__&?}J1($EGni7kT!`e>>gp?(b8?MgJAFduehygrn;?^ zt#Q@hYNnoS-QgX<%$FW=ioY^d=UFVN5-bIGGti{{X7#tHwyt!I-W|B5o~XAi|3v_lD^V`uDh|DkS+5=vBGmt_S3-D+^LO!|xzE-|B7MpkImZoJZ#PpMV|;7GBp=PS z1Bd;~>x-Qj<(tb}xo&PjiK>cgF~z1OnYbI zZf}w-6)h=WlbRd`VbtW(~&8gZK}t=pi4HjeV4eRk4ga zKZ%K&;WRC_Hh;|+l~qq=HKF|}^miL#g2JB-7ZR?%T5CVD|CK!d*)!rjcblUvX4{=U z4uAh&&Rh`}Yy1Y*0>_;7(rCXxt$7ldOadZy<02jrP(DxTF@Ww`7cI9eL8yKWAy9|i zm;(VuG3k>5;A;AooT0&x-d>!z>r`5+-nZYPMu;rB1dOfU_K2eef3t)<&~oi_Yaa06 zG|$Q;*RPjL0GzysU=ZyX!Ai`7Z5QNy5aG*T)0hwl2`A#p7sr^*pgX83bjhSb13=W< zvXLU~wkpAg5~-JnE0-Q{INzE;uDX@-XCdMvENFfuMEq(qjU$27|4QczRFc;i3=)rC z{1+(-4GKCfrm`E(`2UNSxA9b^%mbiQ5yX;v_YPqbX)N;q@u+}@$pLbc+g@7#!1?lU z!o+jq4Y|u=^knwhujOvC39yaSfZ16>13b76`>D2i2`5i_Q%W*L5|)Jq&|gKAPoHm$ zb=&O#oQT^+U`ttt?!{z#)21j>j;-iY_Pb8`ESHU1&uB>!n|keZ0m=jt)^zujW5Bey zHPIA!DZ=zyd)^@-)qD8Q(WGOkcH(;By8NE03;R15A+);RG-0NK5omyR=Q5_%H zzIC+bLcNxFV!7otNrU-YH1fKim#2huKr<~I&+%+OVl z@{O0js$4MsZchrD+3@BrFVac&!dZAt}5(yC$&L}qzlaXPZA&)l+*PFbg@YdD1PyaD$~+e zZGqHD#UC&nXf|_yG6{e^iU(8pWd_gMhS4|e=*pjJ(02TfLWUY6?Fx%p?T`K|+0L_G 
zuSu+XdHM3QoeLnTgi8WFzLa)5@6r1Me%qle*&7mW%Mpb}b=uK3=%OEs>Rk6*{WWke zc}?G#>?C@_wnH|%4pA<&uchkicx42pfc9q>0W+*uPUKiA&@Eeoky=3LKH1CS-fWRN zA=HLWmzafAy|k#)RMULPsDkTlsxcn6K3TRcUcVL$qwk|oa(uOe%@tzC?%MuQ4$QTU zL@a(0|KHU}6A)jNo_3eggCjPHB`%fMM(_4*Ry8atS-4POvZCK~k%Cv1^;{bQ-mKtF<444 zp7MRcn51OnGi44uhn#+2fC|`H4_OgoPm!Jew&HhH*e&4=jWRF2DOUiR9v^F5s!2tg zUk|M~SW&343nMVf^O4T$77osBa#!hq*6T#D1!*YmeGs!P0&onP|L7c&J%A;g(-*;l z`I^lkl24n`z3;XHcQGo9bpE^5z6b`EWBrw0%J??A_!gT6{1m_birI@PpRyos@i+c= zmSSoTOEO8)DRA`bgPHdVEzD)kLN!^x5C(sQokC7tJYW89F~D~Y-Q5v5 z-=rKW$f~zpJ4-oeI3d`y>$uCKc+L z_gO)gC}D5({R(>OgB$_{TDyvG?K!%dof27^rieXX95nU1G)d6@Wq*U`loK333qETi zPT;LkVS;s=3<%3-!~xNV@sY2(6}8Fp^il{%(f}tDy*--{N0_JiP{p~CIW`@VL%Wa- zh~J8PI9XdjBB?-Xe^dH|J?;GT%uMXR*Eog|-+i7J0-tM~9A3^!uO1syTHRt6@~$6w zgR=A~B6oCh-eZ5e?J!rhZE<}}=J&m{q^4fL1~d}CJa@;SIG4e$_t3QjI4P0=jOa0j z2z-_tqDLd<#TQWR#~Ti_j|SVk=$I&{$KP9^6$qji+>$QP@yMd8F{rBZ$jqHq+6J8 zUTdr8Xf1KMWIZ=2FF6)b>i?7i3~t(($?zs$9Z!L-{QHS`zH}8`;F)Ss2Jq`D6&cn- z-YIET*^MGF)S&}6%(W~PL%_uloZfx~2K(X(cv(&Vc#C%w!bxEEFA5^!F2EMu`5EIT zj#)S``1NHzDMg}R$q*<{Yt+)kk_Vr+?u&>n*TtQmq9DH-wwmP!{Nq}MU~m&Z$_9sv z+gu^*58@s`Nvy70q48N(tb(z$b5OA0@Jeig+drIujXM|6&oG?R$f&4zVAq!c|0^SA z@Hm%2mCDBWC;I}COm;i+m~@=ls~VI<+VtCYIlZ9X=KUfq;fqbrp9BW6!APhcc%jlq z<1VI{7aGS8i91LXRS^jD$U6?6xNb}=j~5F(!VaIV1XXz}6R{f`M{t zm6U5?mL>EimaaBs!Saw|GimeY!ZuH0S)W_`cyH@ZD^eO7PH+-eDI%*Hdwg%zxWpOK zuq_s3yxut3z+Ee(906#eoTwk(1_Rth%^}Ocs2iNlmu%=3so=kS7eh=0u2g+suir9s zpKx_|$3K~DnT3fjXZUs~Jj%pSzfmRdYo!orCAT|~5@e<31Xa7T`A-lZG$i-J4LcIM zGhK~J|DwcE8ce&J!TyB3c$I1K-Ahsc)fIXs;C+i~qJf-ro_`N0-AdrTDAbOPrxp$r zo0GB~VS_ntjhB(`xhbxGciqn?vpn>2iQR#Obu4v z9f{Z_LQP(GkNp;P4=@Fi0H|!!<9Kwce9vihx3JWDbL`_2b=GwsYk0X2bubvz9jq6aBVpRzq&uB5O;#5a{Y_c8E_hJ#OwwGsChTTJH>eh6l zqs--95; zcmrY^co0hTIo#xLmb>-kwf1<^7tG!KL0-YQt)4%;<7x8`n&j0J4s819uQm!xyuv9T zF3ZjilodUyaR2jUm17z{4g6-yGC=;k2AfxJkx|`pBx~jnO^_iy)C9%I&Oy&tZHpmw zDNYm_J>ioY83ULaeC-b{Pla|1gF3$?sEg+x?%uKmKMTWyaSImF>o>kS)M7Y+z?`0u zwB>#5#|Nk{A9@mXjPsGOe*{gNgDf{iD0sJ2-ijMxlcAR+?@j|QGN9jTV0uF9z`Wt< 
zN`J)??;;KA#6?O7X!5zXWvxY~$**o{-R-8`-nt5yNe4FrQC-HImR8o5q>H1NfRi#v zT=u$GlCG2)(6M~9K*}H+3rbWNE(h+V*qLPG8sL;tD>>h8k0i?lN}LbSz#i@|zTn-s zz|cxxJix+HI~gt38t#nAka5cv&9m;U{X$YWf5Qy-XhfM8Ll%a(%h)~w6lN6;s^aje+gR*pOg_Qeg_8xzIF6Mrv^*=pD3SYX`JEqp{_Tb2VE z;It8)o#MQzKq07<$zP28L_~Kgpq7afq2yAq>3xv3^_@kVe~H4A)-zRzp&kB2PCsjY zJ!vHrj`ir8)K! z{OB7j?%T7XwHqG`mN7ClumC}ks)bYcb*Cw74dMl820}`*0H|3g-yC`6tYlPbUghcL z+KJD}(hyOKQu&?v!8X3pZwmC+uG$1~k`bgOcN9TOFBkB<=-(qjH(L@JG-dfgd&|Yw zUJ~?fUt?#B%!NHS1ueZHKlVxD3X`!tE9fs#4~`3pA>iLR<#@LSpPaO;*rhGCUK<7B z4!x#;1`IaU4XF8>R?83V0DRbTaQpPpun}&-0<#s+iOHd?u;+0z%Rm$x&rMC>82oj0 z`fxztDLD%96Q=!xVVkDu7150Ir)nwu{$&*ZJK#Yr^BC(ypvkhElCET<3#EA=a-Ys{S>hLy{$9wx)7) zI^Fj7#VS|YSZR6){w%d&>rZeZ-inVZ#;QJpyo8)1f;4XVp~9=tGw$ued``0>-x`38 zpT%Y%9W(5noV4USV1O8?R5?t&JJaRvO!mJxUGte<-Gf$`m^e_ue7y0m=(2lS0g65aw(Ay}`L- zj>Ld+QE~8V*B9%o_(>_`FX8(`BX5Dy^S?bO2r{BL{|3Y|B>vcUzog@d@1;(TBT`j< z`oi;B5!Iw@7Hg@*}_ZOBw?H^o@Z|pHB6L&3G*jQSO z;Z1#h;dDvjwK?keP`kc)gU2aN$Ucco6YEtrh|u^!HEe^rYx&`Q0iA@ z5FW%BZEF7eW@qP9rgXo$yHfK55I>UdRs#`l8CgL`6jRfUL!xf?v7hd@7n=*Xb`W_V zZ*mX#AyO=m=anuM zW-ua`eh_#3`iz3J1#XE+&bl3c^XwHno@K&>Q2-Cpc9N*bjLIYep&Lfo{aE28wN;vH zeDm*_VlDp23~*qY0AubwOdk>PUM}KhM-aIyhT!*=|8rUiRzxu$4O9lE85etXv;q(7 z(Lju28pBx>E}z`;AbzW-7XRc_vK(`p>~FPr!Xu1yvpWV7A*X z;=$D$BL!*X556Jo);iTVTyp1CT3UO!W7sc4Y1b`KMeY^3s9^q;B-X?EZ5j@x#25Py z?xs$#bHpAMA*;{dw&N1M-9+P0Nv;2UcfuCp8N# z-#C>ofOFyZx&P|KzKEU0be?f`5b|=^N7R3{etF=HYU&u5`7}4;uz);1q=U>lVGn(J z5o_c&|AQ)4+$+u-fF>Kse4zD**3A0d^D^r!@otz+v+k;V$on;@_HX&z*m6S zrG6{8!eL6laXQ&Hm;h!9Pn|^9h}dkJeovMv1d-VEFkVKJ=m)oOm;6AYroCv+A~Yvk zCLE!k-KjJ4cVAqWx)zK_|H?E47i=B12~&N=@CELITw;oVrsP)vg52y81I0f4lEF=)qIJ z!x^GC!p_Bg2L8GiJKOprOFeF7?w9Pf`=~%+C8HVEw#TTeT&` zj5aE_ehJ!R@;r?v<|x~Ey)dg7#yPkvk8V(MgTn4sKCqQ%hVz98Tv?&I2(haDeE0-E zt5%CMrXFNfD-E6Q8*%(loPu8+(vl_6=nQI>CoezXNk27JzDN5?$RMV>(X#a^J?MgI z^SvoQ59+(O7~zvs!ItaTOn@D%q~DK^c;svTOl9}E_b-%Bj476xbxUfSX03|&y1%M7 zyS7Mmy>Ly;Gas+UbP<1^Pv6P|;-JlfSj2%hPm#O3Rxm?NQX_{3$+ODC_?bJMk^LF* 
zhPBZ(II(m{zbU{#7y8f*+=Nms?G*4#puBd$uo9Be$M|{(*SDluD<9GvKe5<6&TMi) z)X?aTIw`EkDfsqUoB5|5Z_Lg~inPjl=52nk)DOJ}C4K2JWan@A^so z#Ww{mx)n5`En>xbVWMUC_ z&8;3|xLI+_sx>6Dq5K~Zj8$;5)saVBRE^el(2;gMg~Vl50k^3eNLg4_=DRn7N1su4 ze)AboZ@wcWzUXk#z3eN;%6hH?f8lou)=eHLn?xbe05L~GUp-r@ZfnaF@R$%ORQ;v> z!tOvg(n_c&qf`~d}VO6pk*(|Vd1MWGEZkra7T z&*8Y({V~G^!nvU#;+mi~BQBRvv1j3URSkil{x#G)r?1i@RgYZxrz3P7b23E39HYcv zS+_ubrbq0ZYiP|Rb}oNGRX%F}kjR@jvF4rmAELC263c{Yn0a#iQ-Ip~^OkbvN=F%y z7K^edRvWFEXvtf@o!9Q<7Y#;b0$Iyzf1c#;T1+8J5f3bmLRhKj4i@iJCH?eOdzdS5 z3d*l0?r8NN)LVA9yOMX4KAZUl+eCUwJUISR+`8nS+4*;~1;4MW&lh~@vLc-g_>3#X z=NhZ{g+j1nM<-WJ1aJRfR;D-p4Dao3+46y?-QS;4ktxU!F%EyNTMAigd{e76msat` z_<2>Wdk3qvyIPL9P5GW_>GO%+UQ%ve!+&(0T6)C8`$*gWGZ~_y8B2Q5ADat3@mh6) z?bJVLPcFE(n> zpzOJGJ+UGkQy8sa!S+}71S5TI#gnZCG3^NcC7+d7$$=6~_BRY&ZQkwWd9V9~{qPc0 z?dO+E7}KzzyLzr}U|n9E75jm+d8EPDv-V{1MVo269m5^XB60yN>_VmfJHJJo&~e`> zc9K^;qI$MNK3J{3`vd&V;bQzn)(C&y#Zf5-Q0YNug~1 zrywp&^PW%_yGQ&U-y4PutyLqKvctVlo*bbU107tF~{23*bp87bEImIRy)Q+-I zJ!*p3FC#|2dIjjxOjn21Bdq}`q*3OdH|MOe2So^Px3xGIu-J@$OjbIAc z2}P=+@#q{4@tE>h`@o+M%nfcInKzk?UYE&!$>o=M@HH;+od>-p6_Ar6(~dSi;@)ZSJJEQNfN;j0~9=HIj0X>3AZa&$YDyBcOK#(jZ;5vn>E zezYh+4C)Rb$I%m*ZcX+;C%HHt4ed()q*rK_EIu&%aVC{2hOxEv-~ zaHRL?mZ%e@&9c}M<2X}zVFnGq@(PDeeV(-z!`j8!oUJW44E%HGw(+?BSW8|5j9Pkm zH1l48c+L4vZ9oD?Y$*kBUINLZbNQpqNhOBtiJ1}Aw3@UKVi690C@I$6m3L(OW9s2n zp(HB1MQJ3&5r{^@ITjQ?V*5EfJ?o~)KbhQ!bk?}jTu-m~VpsV^(eIwjq&7XGR|WoU zCXAy3KcGPzxJtpq91WbC)Xcz1qBr?S#I`*g$Nyqibemu6J%rd^tcZkX^Kfd&DNH6$NX7f*R5Uq@@Jk5qs$8H1B^AhCiBwb00ozB)*oT} z@jhN&5|b6i)&Vfl2on_8YaI1Ip;sH-g_O1Q+U-S(6GJ zx?e9EnM!Y|Xv*=}KQ1$u1us$XSu5&jvj8a_3G60tK+4IR4ZQqAb^qe^eGC`)wl`Sw znjAfw;3G79cul%HL4rPM1p&d{nV~wZnF>$FlXfedBq7YPC2;gHFe3XWM1wAHt?o_+ z*@90juV(DJ9cey5j9-l;l$i&Rh*3YpcpFgN>0bdY0S}U>co*G_^i%eWiC^3Kn2)DR z67Rl0=AExC-W0ZMI@c!tow?J3RNCpzE9|vfzafPmlz<6Q;8_XkDDAPgS28+2JTg?b zDZszQ0!Zy;PT%fhCbBfN14E}I1XdEPWUZ2`AoJZA@xTvCYe&bRJ)5sFD9A2@>9(1U4jV1&94=HTDO&llPF9y1kj_)-o zJ8%V<9sNnn0_}t8S>WQePFn$dG&mL%cyJfPWqw!3HeE&s9PBPwkrtNF?#$d!Cwn`G 
z=(1m2n-#qOtlO99m7B^iv*tGQ+Laws2^K^nmPF5^^-pvX^tio5No7>Df&Vx9z$YNG zf}T&oiT1Nbk*{UrGjBgs#pg5Lb;~2xzR88xD@1rf*RR;F3 zb=&5k>myZ;2B@_!fOsSH<|$APb0tw&Zf!U3bpa<>#W}DX3MJ#j=e%nGCgGkkE4;|i zN4B7TNIe$JbgO9Dpo2O`DZ%=+J|BJ~aN(zkkoIm# zlHuSqOe3TVbeJk`9{8W1xE_+GiTmFt$_0}UK0oGzyalXDMKO$T^WDVm0Z_{}+?Gy+ zNPm6Q%#Fh5v?a4{7(8f3V(Zfvj7kQ5sV_uLU;suybbWHWiybQ)XA!4824uoHzJmkx zi}tXnx%UB>X0{<3fdHRn0#Fgy$Yu$^G%v58%^?u~W=up33)cP#*oeez-tw7HPGbZf zSsc_LMJC^6tks-h{KkWnT9ta6wcb&P6<<X?ww9c79|Vh4f` z`r~1}=Rj)K$5F4raVCc%cEeB#c&q4L#=@=}A*@9Ed@X!|P{fa8OxprPC~Z$o0#4;h z{K{tFjm<937~9E$x@r1eoNiScv;AOu;9(EqVk7-^9*N3g$b<)dXv21=PX&)D;&IdlO4?9_?M!5K04KpF}blL&+~a{|HN+)+<$4 zemA8w494IU54Yn)--faQck2TwnP;Y)#7Nhz^o6Mq;aMDhf-o36 z=>AK3kx^F9*UXzWa0=KL?C~)Mmu0RBDea0Vag(|H6lslMwN_S61uAOJ)Gj{e62+`^ z%>*ZoZ{)kI;Tt2OqD|osc!*G56-5}h94KFVr}WcUFhjw*$$E4|0?Ddb1|V42SuLFiAs54pO znqzNa6!v;%AD0EZQ+d-}7;38N3D?gTw2XYrZ~uV5TmGtJ^3?)L1ctPP*>&Md8*kUN zX+c9g@sVn#+75UX$PIm|wkni6ALl>KzH4A6$V&!nX)JQ)g}v%Ydb zMyNDmTK7>0-{^L+ksCzfZ6;v9HH6oC&=tVnN=h}o!ej*~4` zeEtbA!@y(rMS@IkaBAj6tYTrreR>(LA7Ca@7>OBD`*92DBCyfEW6H_mOQJazP9uQK zadqLEhf)Og&KwnrEZ$aynOlT{L9E_sE7y4GD5c*tU%LZC%GLh_N65I=wm6(nO60l( z`V6kJk(+F+xkJD6`<7~baslk~8O2Z9KI|MmPjdn4-kk znzWN>#~5i8P9O79>n>hrT?hYFeRmClLrlMQ{`3WJu*+d}lNas~;fr`lS(3@m9t8uq z#td%*e&=tu(d2*zm>b~l=l?KmxE~iZhR-D2*M74t3fgVt9(Y>4!dtf`&>y(yL~ynn zjACTu*#9_b^#XJk^4IpzaTI7q#tNvJJS;G-95Tn85?%~r@%pE^uoDLzD%ta%r^8$) zr}XE1J$WJw5fVr3T<%^)8)r%pra|K@V@yS}VbS}ZQ2H$#&e+rou1H^daeiW0BcC_o z>gdv5{NrR42LD_Oj7hC8n^UkmrER+!EfWqHjQ^Cutl$I8lRaW6=ewD}y6Wyww_xOF=yADsa<2@}lqCwZ{wCj80)~L!=dB0Wz`3gz5p-A;ye13UnsU%!|$JOw-U`eIg+G^o~<}G+wW3^mOI5<4XUS z+snb1eM&{;;Q`I?$F(;XdoYQY5A&hs2@Rin=V_5Y9BfCy<@sB(P(H78V97h8HkjDZ zF396VHQOKzA&qp|2fH}BE$FrRSH@vgGz9ZX09ldX0XfJP$H4duul^)JWxm=7k0(=&uM3-H8((VivCMKr_n76?aFPbE||1e zoCG$`4r$t~@X&sq^B1B*V|Va`_jWP_7S7#;iNmCqIm08!lV{%I-eZ*0wVxC$CbzYQuLdZi+V{CfP0L<8%C8egmK zxA(&PjNK$-GDSFlroE0s2=(K`SqFQ*a}xKM^b+@ETi}(}zWhj$6w|M~aTqE1{9#(y zmmI4G^D@(ZGZ4)*rgPd(pQ?Cy+vCMU8Lf-1z49{KZ)V)Tb_Nn6|3UAAzE7fc 
zPeYfz`b71rf7Yy@1r>e|XtnWBPr z%ZXc4vd_V+nDW~@6SY$U&uJ8=Bvd}6-Y~d2ws9c0bVKeKf4f1LX`Aj=oqfA$e?ivVUnh_|$E{ zZ|{5K1No0hFF+gn%Ym)8U7vh+22)yez{gkk)Q$JgcUU*A7M&ktD6>gyM25NLlaSPN zA8mP49i1Xh_b2EEa?icXdXq055tSSi&jzM4WE3n`N{PoQOV{GWSc-{2SbEx*=bk<9 z#a)|fB)|D5F~U5DtX~!B_o!WQ(?3!#=nFPa`;NB~3{JNjGB z+036-9x+5(F`74@F>#0GRGAX1|G)0uIx5PojT@Fw1`JXdR2qhmZjeTiMggU}rAvkq z=?>}c5CjPiDcvBUq}0%jbVvyA9?<7G&pF@v-v3|My=Ji(nYs79_rCVE_w|bh5b14A z(T{i#<&l<-ig2Sr3aBW=D-P$xuL@Os($pFkhdKUvX)r6_l6&|-s4733v$we!Q&3Lv*MQ!W zs{{FL$WW2GBumk_n=)bXBg#;7!dMBeAW7eW4_d#90T@Amr(qu0U7KvJ$HsG6Kx zi`IyuCM6n0j~il1BHq{K{dH!3G~x&xYxWKcj-(gakF$@s7IRc*CwIqZ_y<;JkL~96 z`SQ(LF(1ZTOt>q;E=o1*mb;l#)Z=0qh!*4SkH^NOEp0pxj}qzDC->}A!mv&))Ws{O zm8eb;SUgps*GScvM#p(M_eH7IX>}&qSMn=3@OgXBTjXsC!F|doWuxd1FFwQyMKa6_ zk(oLVF#%(3p~)ZF#$*&?+zq#UsMjgx%#V!&gzM4Chs0I!(DC8(fD_)EqDz7YAwk8F z6FXMfu?^i2!)^Q)1>7Bj~{jN9Q{f*0@V5kn|g9a`@lHw8dS zG~3Y@qD=0|?D0Q&F+}%*vvT9nM)j+9;SYDZm8eF$Q^j|dzr^)C9Mw{nST8<5ir;s8 z{SN9+P_7<>)j7(dlPT8O_%Y@MXZ3qHX?4L9`nr0XUJByvaV%Qp#r8v^#YIua{qaQ! 
z8)6y+)c2@okL;s8@8@FMG<30;YhoD@)~!p-O}qyB`VLPX zfZf%`k|ze~w)sz08pyE*(3Zyru{^x?aNPXOumJ@CpnQel6qMqGGI!IvCsC6Wpw1e; zap+U_Ut9R%tZo(5Kb*31gNa9mG+ZN+@AwJf(-gu0HMJMK8F^Bfaa0Odpr-7Y#?gkY z{hyQ`OPa|cDR?=VnB#u_J{p{&+F_>%ReH8k%wyiQgfq$!ztHIg2#nW%l3FLJBw1+m zWkjNch^SD63lVKJ+40oNr>vsKX`%4JOe;2g=4iRhqqdpo{5YJ*Pha5`%AnHRKaa`x ztZ*zM!*K$?Pw>q|o89w3fyB>7U4B%%lPivEj_lSi+~^d@ffAC*#+c7^e7jue?ud#bK1Vgri0RRzc#36``&(VzMr5eJKA^?XWFR=SVw#0d4k*>6K5I@0 znL33do)bIUpM@(CPuMRVffDTA7RuABr0S{e==slWqNCOkOBvOQI}{w7niR{u3er|h z=m%>r<#4#TtI%&w^?$gbQLQd~v`Z=h%!h9kbO#T#7|L3>^(E$A@(> zg`zhHUK0>68Dc|^1jm{q1}gxN7X>7qXrz7CpLedO1GCSXwRfXtqySIl!-mpHld)t5 zf(ujBmsTBb8!wDn`ONTmGK=oyI z?wF`w#kUG$H{(bmn16Ij+kZ;LwkwTh(-W=KQTevONl%65Ulj3z=<#ijFE{uV9x+=< zkw3uHbxZ;5pzaFG38WIrnvvH7CLoO*r_?jC`g)L2Qm5#U_I%6#RXUxUS7z%|gr}(WuF%gx=#-@5<+<^Q*&Y zpzuPB-=}Z+r4Ro#PuGv!6(9DyA(b86Q5!>+KznSN`!Pve5X|#KxwNszr%}6B02S1~ z{qk^3_hri1#;c2uad9;2+EucplaU?NF6A=R201B|5}i#0ZPORk8UQ$yFZ1iLwk+B2 z{7T7iQ4PNUA7n7gmtqV)94HG+P2QI*6a>1MnXL*0)}llo5AXk?^R89Py0gE-C_zf# zK6~u0wIT(_knSC7UUSOp_Kz)sFZ%&#W&CBWD7F_;MQbOX#NLV*0Tyy5Us_Z}0>2KcuS{}JyHAhMR zg1QpWES{ZjrH;$XNAejT-23hlb+quk>!*%ENoHSBt}r}Uw=LD=WY_`kai11$TA}0r zk9yuCgOC3g^}Hz)wBPD^S7ez85g$C$mu%LOkn3AF=;B3Ie)jl$%WmUn4WhyJOV`zJ zOM+%}y&lW$mm=Vs@1crySs!_32b@D*#P{SuRH}>P`anWD> zglUmE3CSB=Ikv`!?owuMtBz9_Dq=37Z+~3jG#@|k{D&gjVzQbdbe6-Q7q9we&u;yQ zeuxdqCk-GVRH}H(Ct{-b19|T&q8Krl#f>hNziF|J8Vskjt%sXB!xK3R)Kqtg*{DC$ zV{gSK1Y>u8G2n(r>san4iSUXtxD!=r@G%*Mz)uOQ-QXv>(loL@z&0b$`{9#s6dWw9)sy5P7 z?k3G&k6WwSy`xR_5aLS;i>^(>bG65Ybc1 zvT19$|Asrt6eE@U3OwP-D48z5r6!tku*SmwK7RYqr+4>T_ykQc?%hlMGgpHJqyQS~ zk6NVc83{L^x!pnNp4sicC5 zE0SG5Zv}YFVTrk(b$#)wK%+PB!0QySB)R05BKiG_KzpV5gi4Vjelo5NJ!%OyE&Gx) z2&T64(Y_B^g#Y`6G^%Smiwtr$oIU)xutv<*imQc%v7qP0Hg#$+q9Fy7SK2pjCBZBb zqD%A{`88vT4Ata?n60gen)i)|iG7Ox<`qv~j?UgaVVI38n|K2!())PwbiS04o=j7b zLgAIQiRviOtsSWh_r8?z23m?HvE8AVnT_A(J~nXrcP7{qA3UeE+%vJ9M&GVU|1-sn zaiNnn0X@^P=c%x)nj6gVx6WtdVa~mb+#5Xen9n%7;7c3tkO5E3Pb6W^`K+&Zwm%(E 
zp9HSsdpz@XAg*OL{oUXQHFS05?uWe(7vEi9g=TW3JDadpGfPSOiTQS1a%jEoRc9nNWBnm6@8q0X`OOli*5N5bDbfttGQL?b<1q^ z0AhHK7OGqJ6#`r3inTF4s%K<~pK%oXMjg>W7>nFD0>L*M$6!U6t$7$E>26*WA<%Nt z9IjPXPt?`AOteAC9n14{f`APHGwzLbfd72d=ouB>nYr@afmqz7=i$_NPh`IJybcY4 z+}SU4JwD_g1&E+t-&>9r8dVNxOt6*Cr+GVyrcLw*Zq#Xkie143V`5kwEMD)-!|myy z6c)3*0lYRzVNDx6niI`+odqBR`+#unyOi5u#({n>DpO`mi4%|%(ZPTV{uvkK$`FC? z6{a--gcllc_Az>dTvO}p`P*s?wXrhdBa#5EKa*5e7XX!4f1T;`*{*34=0=p~g_R^@ zUX04VQf7=O7qvOO>PL#FGb1ccUurGcM9D^J7VW!c2#>F8m~V>^m(OJXoDbAre|&N{ zomU`nLJ*Zg_9V)1XPXjPtuQ1({g32Rq@z_ehY|#~C8Vh{&S{9Mih*@0jR8hckjNH` zQ%w%Y5XsNCpTEA-&AglNd4qkj_i(rA`p8so2aT{e%SNE zZa_Nb;7Y}aJ+X0H7`uDB;f|3*rP{}a;*)562rZ{e;x^acBgWOwI1=cHT${*`PV0L3 zz-Qg4$KIG?aIiIU=08qnmw1EA+hT;A8HhO|&i5j8Wp0vsO1ZD<&B;Y&2J+xuGHR~q z8-HGXt^@S*D5ijdsTq|`K4zPz6Q;Nwo_$ZkPlEodP0|0nimc$n3G}FKDRC4`^Qmmi zV%*sCt&fLZ^r4oom_=T6X6* z@xYY(+Xdl3wLY_bJ12#8bW6W${)bHVv!FQ9x0#qa82=MWFTVo!sqW?o^w@PmYLA!3 zzA`nW>i9Z;2KS5I4<=Ig=k>()01W~ELamhg5{3Au(a0u=#N)xI;OSMj^|2TS(C-FN zmB>6?9=1&CKK2z0%#<9)e-fpvXF&5mxAyxsJtb))1P5q4kADZvzke+9lV1UOLaEza zBS_lczpel{k~VoxXz%s!)_~vSBN>2PL#kKhtp9ZC)eq!zAOZ*e)ab#y$~^w3YXHyP z4^#{Pp!xp!%|LY!eDa(%+i!6H-+BmnBN6w&Z7GU}IwFVAzb?vD@W}xUZOPda{$W|mxE<1P&S@bRNug$ zPT^4pphCm|>=kdvY{_7?y$RAfdJ&NJA^_CMrzL{#l7D>F3WyYv!MR&u1MI)RtyZHi z^(0UM&5&VHA4)_DXLoR^P0y~i5 zLeg-6W^#CDAO;EPe;=QSE+`nFw+H8329Y^T^1^wbTIsY@{c4!OM0nseA+zRtrS2%& z@JWY3IiWl!z)w0ANK2p{Ez+QMnzZ#V)M?PJ^8`qUrCS*GpuR33H+l4pzVQ-P`KKcq zK;P}$zk3tebs{6=*ZPBiiH8gT8(mL@B5{{4Obh*K!gJRcRZB#c9lB`ZOzC8*WVZqJ z<8DYDc#Ly|vKv2fBU?%y$arh~cVI~h0t&mF;G+>FK-UutV<5=kyji{Mpv_Xd?S+K5 z^rZ>2d{8Ow+pnhFlGM~p?zf#e13OGNZJEvNyKLT4Dc#qqH4rzx4 zKGSRqLBWrs__1mZ%M37i%dmakHR7Cjl>yMzfWZ-dEzIUes_Uk${KesfwaX@OvyTLn zLTONVy`IVp=)VLD8*gW2E)q(kj`WWaLByE zsZ#q_Q#IvF3{A;FgiH*8>F|~NtFS7C6w56=-a}H0v9c$C570^FvI%(J!v?2UqGDdu zr9e?fSsQH2N9_^%1kG>0S^@Xu@)N-A&IacX>UlP^%>%&vjL&w?$1?G6ofV*oy5x=b z=RwZ(5S{bluc4@fRROjqJua2u%1#Tm@P=q7Jaxr6T5io2K<3(si-W2FEAx>NV8d>H zyK@Iwd8@OsbKo7-cKyL6Xwn*41#Gb%9XwQipKb)$FN97&B8_)Bn23ec#0P&F4l1AW 
z8d5(j#{cS~ZvYIs?4Z?S3{tqr7X*M+1a>DF7T)2Z(&QZtWzBR9d4`KrFq>il$CQI6 z03910TJ-icGpGPB^Dik^Sg0*SEwjQ{5P;JV791G(Emu}GCYCKT#FBp#ASP^KnjQH! zU*<^1*oDb$;^#5G21ZfuN15yP2dV(oW9t$FNI9w zJqzWprBVS9a+-_jB#=7-)hXG+ye?2E5X5Oxn%kajAq@vYHZlPERsKMLW@c5l-ZWUQ z;uZR}<(kiI>nXE`he%8*p=U=I5|%VsWsBs&9nW}U@1FtqOm6BuqweT;05qd^=K*i} zqRY`{(I|k7*gFvk9a%Q0y5DdX&CJXcAngbzb6Qgik0xYN=Uv{ovt%MfZh9vkD6g-W z`3zeCafhQc_#7r%rr$?H?`o*1bpQR?r+`{VHr0V&Ya3Bp?smM3x<$|fIE-4Tgxeb; z6W_jl+XaFu<|NUdfcmx4T9bY&S+BVFah15!3>)~bR5hJ|TbEZDQEdFZL~ncq2qOxL zl^`X7mf3`;bTI`YzGAF7tPM#OH;D)QNsmE5d?ycdF|w!i`ofWs`;2_Mi#E-4<(Ak} zgxj86o&-1)bDVx8Knv8b?+Kwcp@aj136{?-2sS7v;QM^~-dg%Hd7Flp{DBcbM34Jx zaH;`UxK!`&Z&{W>{a`qPkkR`LiI%zc_Juw%cz$58OIxEV6XdOG32EE=QA?nj*l2_mjKbr@aPGlX&knv+;6v2M=l#P zEGTf{#;9wDKj*_`z{y%l!N?#wAK#4GTTLk9#`9UADqaQ@hI$mf3 zsdZoI0_4IQo0K(MGMuH$I_0(9h8xbUQSoBK|-Q^gvUA6*g}W!e&dpdB3mAWeCmWANzBo-i%! zD`xTU^;Q~;0w08`{N?%B8O=6$rmyHA-SamEYq1KeRa8AIF>9$O{dTueJ$VAU-ccaQ zUj0~3#HZIMe?38HSI&t6EY*<4_2t>cYrsK;@?siM-8RnUXzTVSjfYw?ijulO5(o1h zp5lvP<)OtUG)qv4ag@?LeVffpQAGYNHozZGG0?R&=P^X;`l6H%gn~Ik-8~`*qj>Lr z!wxZrdGV)_`s6|PzOlTEg#C^3rDSvMH=nz-%?sJi(WvChaR$oVPhS@|-!}piJ1|5z z1m7ENo@OIo1hsqZZaO!9d4iV0McAS5ziBC$U_no4U?iMn-=r}w~V(Mfv1_^izaiDnXN!k2tr2+t)Mi6k{p->pyE4@?{4#^-hEY2rnubIhE z9Q__3r`qtd5P@c~3YGe6Yr{4?z_MaxlV6;~ zMma>kFZ%`*Nbw{F7ie0>#awB`N}FfIjZ<fvU%&(O01i72?~vi~RTW;9ePIe6vFtQVr4oQCuaQ+UnMawDTl5=X2c z#Je7HoeDRzDmvFD9+D9#xHa{8^i&_-^a*2UvI5TnXYMvh)QXHGzBZL6nKt=5Kj^&d zpb%1;(ooIE4H804utq0m{uDyka+_vki}r--cYB!7e@;Pg9ndGfkx!6ylb^z(<7}31 zGFR#L+|1&B8PKs;hF-+6Nd`%x`754CGpiLUSR2e-)2SmuVZ)LoHNo@I1s4Ye7speI zw#0G$Wj~_1=Kpm`Do01klT;C_s30TNt^EaVa+mXoMcU9odaV<;YxO+7!euXpwJ)Y($ zXQ7pJ@3Okml^WMQ!kzkk55Q9`BgXBncIH|*5q9;C?v)$rINNT)meFy2+fRrZ{lil@ zsYART>^SOr1s-;v5n?Sk?prp2rk3XhDYV-dVbB_Gm~UQ2U(`;#TvW`%qL3I)ZkfY>0d^?;S^!H;&jB&enG z@>4SRp>;D3G+A!(m)wH5#T??&t4;uQ6@WK*2So*3DdA zbQ%zVoE^1jf9E_*k&5x&JOzu6Blq5N|5l*qDeD{m>hOGoYpd_!E`8F7dU9Cna`kfR zy6fDzRShA+G#`XcaBVzAmu;ccPz}8hN!T{;Pua7vCZ$45H|w26k!@6XK%W=1O+J2f 
zVyCnXatp&RJ#z;zm^&Vvcu6aXTB=K-d?%VIeY^y1dO8bd5PD8$Wjfk&?duJ=A z=KO3bV-^&gM$E3kQA{n*D!$J*e`Eg{EyHHmxRezx#NcR);}?1o`p$XQ@USn@>x1*# zsvQH6h}l-#b0|a(bo^y1oz#Y6VkOW_P9TAQOEorV>^3zU_g=iQ6aq;2SEr)cFCUaL zO%`Mc$@hHem|&7oR9;p3y-?i4L|Qy}aD?$K#h(Fe%WrZ0* zM2{>Hm3{5C@P}EPWbJiP&feTkE6*RTAQ+QtIs#I9r`nh6>9q^mUL@~P2AtyWNr$b~ zZ}v+sOYwpw2#?$x3)vyxr!iGkr7Hfa8ADe*BF4IPJES9taFE27s$J-QEjJF&?X&o# zo0b)<>FVqCkJGrtoK# zV5LL38N<}>GI+5|ged@>&roB5bL$r_`-)Y1!0YRQQxIa=@4nT0?D=ICOHxvhs9E9R?%{Y~VjVLRLPzM*D0 zPhPAq1IK|ycRAgLa)mm*h&o#`z`@I{1$!i9)_AH3FWGIr(LaNv>E9BDs0ai&=eOLX zxJlOCFR&L9Za~o_CG?srnTwde|Eezi3Z%>4>=xC(i$z! zP6Xpk9ih#t`RmZIhAXF_W}ohhH}Ci!mxedF-TpQ2FMp5r$+7Z5782ITHwa5+DD`ME;~C#zLN7r)ETn6};#&%oPt_rr%b; zzePXn0gqJ2((rsZ_x#67$kG?lb?P@sN zmEp!6zq`^~8IC>~VB}J3gU6VR(-P|IiCVl7}Qh^w6(f zYvnIBzk1DKHvo{_auX6Or_#(LQW1;xPBS{`0_pg{551o*fTzcScpA+bT|#yu41(ji zmPTxA&ylFW6LQpf8r|8nuKl*NT_EtyobcY6xSRA4e{8=q z-9V2w2T}V-I+ur`S0wUd%^E{L^eHSJs9(?~EQ^+5Y(_d4(+6uP9rAO$iFRfNZ8MYz zYIP%U#Zf>CT$#*$Dk1H^15J?%md)koSfDQJIbfI{<1x6C$k%?0rmOa8q**-8ppr~B z5$yENn7!;|9b;5o0+0N(>!*^mec-Wia%6NP+>oEjs8BU)^*zlV>duLkX4kd>DUVQP z%+bNMYnW(Kk42SvrnAfnV@f*0`0kTzC_xRZrIPEEdm;m&W(?Uqum)6x1%#g zTmPGuT|EF+dx+{%m7z9dwWQ_F!_^G~r`Vl7?1uA`_4oJo0`*$UYH0?krC{VqAnXc~ z1&1w_N6Ok$mFFy3vL`;~*9l>UNod@Hg_w}CST&YLyfY-duj~vcU<&HMsqwyS&nbo4 ziX>AK6RA9cryHNT_EjRCGon9wZ}f^KOzJkZL3s;q z;IDbjCYYpl<;T4@pL=!hwbe>vqy8Amd?Z_1E^Ztn=a+Th$uOwLnlZpRUfbK8b7s$Q z(K%?yuxf7gd+|V~SeI8k3OeH?jo|)Q@p}AQq_L|X4OE~?bfBntcd$#6kTMDZc!7qq z3|;KyH2B6$S}lz>U}Dh!f6%6T-cbcJ9=8qj9sJ3MeFRR|p0fXlG;H3j7ZTvw z4r=6R^D%hz@k3gO(=?Q?*&wTC>Ex%}GNE)7<9*$-TRvTXsU;^Eb<4!ofB%eK#xp2o zTO0w-9&>UXS+R4*M?yaY9=Lc6W=O$c>m70)+x&BnI`)>q_$D3neJwq#VJ4d=RRBG0 zu#AL_aQ?I7$Wb}G8-c8Ns_b4fk3qsKX*xM~C(BeA^qQ=|P>a5-J#~srGbTw~ySGb} zil0%c3^0N7_aA6&5yW2DnQ;eJijE;|nC*AE0bnOZR(Uuav#p7|6fE~VlF6^f&9tX~ z^uv#CV>G+al`dKB4NYuEG;y>#s1ATZxhYY=ns}>>r`CmTImIc}G4);qX;oCrI>NBaB?Jr}CColI|1O6}M-f!(<=j zx|cl!!O>zaQQ=>~5rX0%$tv(HmRT#2@sad=-!>_HE|69;&Ic!jHPHf5%Lf$sn#dw_ zfDPPVK%@ksTwZeEfplrLJ`(2B{0b&7(GZjO9jav5%-Hn 
zoBeKOM0sjFslEQ8FEpKjltzy?h}%bbs~ViYJQ1hvCU;GGEQt(TU>KVTi6t#n_WiGaUR~DyG0uU!{?%l>7s>`XLX^Ytl9l*^pIf! z9wA4OP6MLy&7d65-7Lxb!BO)3T}iBpz^t9ANV{&lMfiQIq-V#|1F}R)_P^``GC7OTSS%;wMS+r9>VJ0Qe9E7QUIlv#| z8O1Ny2UE`Odt54Jf;V73&bjUUr3M7ti=%#J>evDMOqh>u7MK~2?Pi0SZ#@79P)ixS zKr~QZj}fy4YyN0X`u!m8(+9IRpdqRuPJpD~%1zCwit{Fv4HOZgjQdkbzA&eK6VeKS zozcF){S(6e6U%TsJt6Vm0ARwZ!h#3M)?0J_0k(QuzxstvH!Nys-}D?H5 z+Zh~sA8anVflkEvB00ZTDltewKVLx--zSp_$gJGn;vM{ImCo^od%VQ4#A&^%Y}C9+yiH+$Sv1-IkG9NP?bhVkE6e;jak3tB59 z`D0S%am939!nZGVIz7-u3#tdND9t634?&%0|F2#f8Xt3l;1F2W&myzle}_)+8*y54 zxu^`P5%mAhce5l#B%+?5jr?W&&+&%r49^VY;mk?^mnU*}@z1M~4iCIb(7;us_RlbZ z+$`#@pc`@$^Xtp~Ed%-QJ+KBL-)*y&Gbj9a_oLuK`B8}F{K8c6|M{htw-G`K-j&$V r7)<=<9KW3Ge)6DE|Ianq^8zIthPt;XXgzoh{F8bj|M;VruGjwo`Lm^D literal 0 HcmV?d00001