diff --git a/README.md b/README.md index f013e4e..95fced2 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ ## Overview -iText2KG is a Python package designed to incrementally construct consistent knowledge graphs with resolved entities and relations by leveraging large language models for entity and relation extraction from text documents. It features zero-shot capability, allowing for knowledge extraction across various domains without specific training. The package includes modules for document distillation, entity extraction, and relation extraction, ensuring resolved and unique entities and relationships. It continuously updates the KG with new documents and integrates them into Neo4j for visual representation. +iText2KG is a Python package designed to incrementally construct consistent knowledge graphs with resolved entities and relations by leveraging large language models for entity and relation extraction from text documents. It features zero-shot capability, allowing for knowledge extraction across various domains without specific training. The package includes modules for document distillation, entity extraction, and relation extraction, ensuring resolved and unique entities and relationships. It continuously updates the KG with new documents and integrates them into Neo4j or FalkorDB for visual representation. ## πŸ”₯ News * [29/07/2025] New Features and Enhanced Capabilities: @@ -458,6 +458,37 @@ graph_integrator = Neo4jStorage(uri=URI, username=USERNAME, password=PASSWORD) graph_integrator.visualize_graph(knowledge_graph=kg) ``` +## FalkorDB Integration + +iText2KG also supports FalkorDB, providing the same functionality with enhanced performance for graph operations: + +```python +from itext2kg.graph_integration import FalkorDBStorage + +# Initialize FalkorDB storage +falkor_storage = FalkorDBStorage( + host="localhost", + port=6379, + password=None, # Set if your FalkorDB instance requires authentication + graph_name="MyKnowledgeGraph" +) + +# Store and visualize the knowledge graph +falkor_storage.visualize_graph(knowledge_graph=kg) + +# Get graph statistics +stats = falkor_storage.get_graph_stats() +print(f"Graph contains {stats['nodes']} nodes and {stats['relationships']} relationships") + +# Run custom queries +# Run analytics query +result = falkor_storage.run_query("MATCH (n) RETURN labels(n), count(n)") + +# Clean up +falkor_storage.close() +``` + +For a complete FalkorDB example, see: [FalkorDB Quickstart](./examples/quickstart_falkordb.ipynb) ## Some ```iText2KG``` use-cases diff --git a/examples/quickstart_falkordb.ipynb b/examples/quickstart_falkordb.ipynb new file mode 100644 index 0000000..d4ab341 --- /dev/null +++ b/examples/quickstart_falkordb.ipynb @@ -0,0 +1,602 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "33771720", + "metadata": {}, + "source": [ + "# πŸš€ iText2KG + FalkorDB Quickstart Guide\n", + "\n", + "This notebook demonstrates how to use **iText2KG** with **FalkorDB** for building and visualizing knowledge graphs from text documents.\n", + "\n", + "## Features Covered:\n", + "- Document distillation with custom schemas\n", + "- Knowledge graph construction using iText2KG_Star (recommended)\n", + "- Complete FalkorDB integration with advanced features\n", + "- Graph statistics and analytics\n", + "- Error handling and fallback mechanisms\n", + "\n", + "**Updated:** August 2025 with full FalkorDB integration" + ] + }, + { + "cell_type": "markdown", + "id": "b8ce00a6", + "metadata": {}, + "source": [ + "## πŸ“¦ Installation\n", + "\n", + "Install the required packages:\n", + "\n", + "```bash\n", + "pip install itext2kg falkordb langchain_openai\n", + "```\n", + "\n", + "**Prerequisites:**\n", + "- Python 3.9+\n", + "- OpenAI API key\n", + "- FalkorDB server running (default: localhost:6379)\n", + "\n", + "**FalkorDB Setup:**\n", + "```bash\n", + "# Using Docker (recommended)\n", + "docker run -p 6379:6379 -it --rm falkordb/falkordb:latest\n", + "```" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e98ac07c", + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import asyncio\n", + "from typing import List, Optional\n", + "from pydantic import BaseModel, Field\n", + "\n", + "# iText2KG imports\n", + "from itext2kg import DocumentDistiller, iText2KG_Star, FalkorDBStorage\n", + "from itext2kg.logging_config import setup_logging, get_logger\n", + "\n", + "# LangChain imports\n", + "from langchain_openai import ChatOpenAI, OpenAIEmbeddings\n", + "\n", + "# Setup logging\n", + "setup_logging(level=\"INFO\")\n", + "logger = get_logger(__name__)\n", + "\n", + "print(\"All imports successful!\")\n", + "print(\"Logging configured successfully\")" + ] + }, + { + "cell_type": "markdown", + "id": "d01fd014", + "metadata": {}, + "source": [ + "## πŸ”§ Configuration\n", + "\n", + "Set up your API keys and database connection:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "12e9c03c", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Starting iText2KG + FalkorDB Example\")\n", + "print(\"=\" * 50)\n", + "\n", + "# OpenAI Configuration\n", + "OPENAI_API_KEY = os.getenv('OPENAI_API_KEY') or input('Enter your OpenAI API key: ')\n", + "\n", + "# Initialize LLM and Embeddings\n", + "llm_model = ChatOpenAI(\n", + " api_key=OPENAI_API_KEY,\n", + " model=\"gpt-4o-mini\",\n", + " temperature=0,\n", + " max_retries=2\n", + ")\n", + "\n", + "embeddings_model = OpenAIEmbeddings(\n", + " api_key=OPENAI_API_KEY,\n", + " model=\"text-embedding-3-small\"\n", + ")\n", + "\n", + "# FalkorDB Configuration\n", + "FALKORDB_CONFIG = {\n", + " \"host\": os.getenv(\"FALKORDB_HOST\", \"localhost\"),\n", + " \"port\": int(os.getenv(\"FALKORDB_PORT\", 6379)),\n", + " \"password\": os.getenv(\"FALKORDB_PASSWORD\", None),\n", + " \"graph_name\": \"NewsGraph\"\n", + "}\n", + "\n", + "print(f\"πŸ”§ Configuration complete!\")\n", + "print(f\"FalkorDB: {FALKORDB_CONFIG['host']}:{FALKORDB_CONFIG['port']}\")\n", + "print(f\"LLM Model: {llm_model.model_name}\")" + ] + }, + { + "cell_type": "markdown", + "id": "8b864c56", + "metadata": {}, + "source": [ + "## πŸ“‹ Define Data Schema\n", + "\n", + "Create a Pydantic schema for structured information extraction:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "ac3dce37", + "metadata": {}, + "outputs": [], + "source": [ + "class NewsArticle(BaseModel):\n", + " \"\"\"Schema for extracting structured information from news articles.\"\"\"\n", + " title: str = Field(default=\"\", description=\"The title of the article\")\n", + " companies: List[str] = Field(default_factory=list, description=\"Companies mentioned in the article\")\n", + " people: List[str] = Field(default_factory=list, description=\"People mentioned in the article\")\n", + " locations: List[str] = Field(default_factory=list, description=\"Locations mentioned in the article\")\n", + " key_events: str = Field(default=\"\", description=\"Main events described in the article\")\n", + " technologies: List[str] = Field(default_factory=list, description=\"Technologies or products mentioned\")\n", + " funding_info: str = Field(default=\"\", description=\"Any funding or financial information mentioned\")\n", + "\n", + "print(\"πŸ“‹ NewsArticle schema defined successfully!\")" + ] + }, + { + "cell_type": "markdown", + "id": "2b9769eb", + "metadata": {}, + "source": [ + "## πŸ“° Sample Data\n", + "\n", + "Let's use a sample news article about a tech acquisition:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3ef36581", + "metadata": {}, + "outputs": [], + "source": [ + "# Sample article text\n", + "article_text = \"\"\"\n", + "Apple Inc. announced today that it has acquired Emotient, a San Diego-based artificial\n", + "intelligence startup specializing in facial expression recognition technology. The acquisition\n", + "was led by Apple's Senior Vice President of Software Engineering, Craig Federighi.\n", + "\n", + "Emotient's technology will be integrated into Apple's machine learning initiatives. The startup\n", + "was founded in 2012 by Dr. Ken Denman and has raised $8 million in funding from venture capital firms.\n", + "\n", + "The acquisition represents Apple's continued investment in artificial intelligence and machine\n", + "learning capabilities. Industry analysts believe this technology could be integrated into future\n", + "iPhone and iPad applications for emotion recognition and user experience enhancement.\n", + "\n", + "Tim Cook, Apple's CEO, stated that this acquisition aligns with the company's strategy to develop\n", + "more intelligent and intuitive user interfaces. The Emotient team will join Apple's AI research\n", + "division in Cupertino, California.\n", + "\"\"\"\n", + "\n", + "# Information extraction query\n", + "extraction_query = \"\"\"\n", + "# DIRECTIVES:\n", + "- Act like a professional business news analyst\n", + "- Extract key information from this business article\n", + "- Focus on factual information only\n", + "- If information is not found, leave it empty\n", + "\"\"\"\n", + "\n", + "print(f\"Article loaded ({len(article_text)} characters)\")\n", + "print(f\"Extraction query prepared\")" + ] + }, + { + "cell_type": "markdown", + "id": "9c8a62ec", + "metadata": {}, + "source": [ + "## πŸ” Step 1: Document Distillation\n", + "\n", + "Extract structured information from the raw text:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "161796fe", + "metadata": {}, + "outputs": [], + "source": [ + "async def distill_document():\n", + " \"\"\"Extract structured information from the article.\"\"\"\n", + " \n", + " logger.info(\"Starting document distillation...\")\n", + " \n", + " # Initialize document distiller\n", + " distiller = DocumentDistiller(llm_model=llm_model)\n", + " \n", + " # Extract structured information\n", + " distilled = await distiller.distill(\n", + " documents=[article_text],\n", + " IE_query=extraction_query,\n", + " output_data_structure=NewsArticle\n", + " )\n", + " \n", + " # Convert to dictionary\n", + " doc_dict = (\n", + " distilled.model_dump() \n", + " if hasattr(distilled, 'model_dump') \n", + " else distilled.dict()\n", + " )\n", + " \n", + " # Create semantic blocks\n", + " semantic_blocks = [\n", + " f\"{k}: {', '.join(v) if isinstance(v, list) else v}\"\n", + " for k, v in doc_dict.items() if v\n", + " ]\n", + " \n", + " logger.info(f\"Generated {len(semantic_blocks)} semantic blocks\")\n", + " \n", + " return doc_dict, semantic_blocks\n", + "\n", + "# Run distillation\n", + "doc_data, semantic_blocks = await distill_document()\n", + "\n", + "print(\"\\nExtracted Information:\")\n", + "for key, value in doc_data.items():\n", + " if value:\n", + " print(f\" β€’ {key}: {value}\")\n", + "\n", + "print(f\"\\nGenerated {len(semantic_blocks)} semantic blocks for KG construction\")" + ] + }, + { + "cell_type": "markdown", + "id": "0cce66fa", + "metadata": {}, + "source": [ + "## πŸ•ΈοΈ Step 2: Knowledge Graph Construction\n", + "\n", + "Build a knowledge graph using iText2KG_Star (recommended approach):" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "71a05d95", + "metadata": {}, + "outputs": [], + "source": [ + "async def build_knowledge_graph(sections):\n", + " \"\"\"Build knowledge graph from semantic blocks.\"\"\"\n", + " \n", + " logger.info(\"Building knowledge graph with iText2KG_Star...\")\n", + " \n", + " # Initialize iText2KG_Star\n", + " itext2kg_star = iText2KG_Star(\n", + " llm_model=llm_model, \n", + " embeddings_model=embeddings_model\n", + " )\n", + " \n", + " try:\n", + " # Build the knowledge graph\n", + " knowledge_graph = await itext2kg_star.build_graph(\n", + " sections=sections,\n", + " ent_threshold=0.7, # Entity similarity threshold\n", + " rel_threshold=0.7, # Relationship similarity threshold\n", + " max_tries=3, # Max attempts for extraction\n", + " entity_name_weight=0.6, # Weight for entity name in matching\n", + " entity_label_weight=0.4, # Weight for entity label in matching\n", + " observation_date=\"2025-08-05\" # Optional: temporal context\n", + " )\n", + " \n", + " logger.info(f\"Built KG: {len(knowledge_graph.entities)} entities, {len(knowledge_graph.relationships)} relationships\")\n", + " \n", + " except ValueError as e:\n", + " logger.warning(f\"Extraction failed: {e}. Using fallback data...\")\n", + " \n", + " # Fallback: simple test sections\n", + " fallback_sections = [\n", + " \"Apple Inc. is a technology company based in Cupertino, California.\",\n", + " \"Tim Cook is the CEO of Apple Inc.\",\n", + " \"Craig Federighi is the Senior Vice President of Apple Inc.\",\n", + " \"Emotient is an AI startup specializing in facial recognition.\",\n", + " \"Apple Inc. acquired Emotient in 2016.\",\n", + " \"Dr. Ken Denman founded Emotient in 2012.\",\n", + " \"Emotient raised $8 million in funding.\"\n", + " ]\n", + " \n", + " knowledge_graph = await itext2kg_star.build_graph(\n", + " sections=fallback_sections,\n", + " ent_threshold=0.7,\n", + " rel_threshold=0.7,\n", + " max_tries=1\n", + " )\n", + " \n", + " logger.info(f\"Fallback KG: {len(knowledge_graph.entities)} entities, {len(knowledge_graph.relationships)} relationships\")\n", + " \n", + " return knowledge_graph\n", + "\n", + "# Build knowledge graph\n", + "kg = await build_knowledge_graph(semantic_blocks)\n", + "\n", + "print(\"\\nπŸ•ΈοΈ Knowledge Graph Summary:\")\n", + "print(f\" β€’ Entities: {len(kg.entities)}\")\n", + "print(f\" β€’ Relationships: {len(kg.relationships)}\")\n", + "\n", + "print(\"\\nEntities by Type:\")\n", + "entity_types = {}\n", + "for entity in kg.entities:\n", + " entity_types[entity.label] = entity_types.get(entity.label, 0) + 1\n", + " \n", + "for label, count in sorted(entity_types.items()):\n", + " print(f\" β€’ {label}: {count}\")" + ] + }, + { + "cell_type": "markdown", + "id": "0cfb5ffc", + "metadata": {}, + "source": [ + "## πŸ—„οΈ Step 3: FalkorDB Integration\n", + "\n", + "Store and visualize the knowledge graph in FalkorDB:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "98516438", + "metadata": {}, + "outputs": [], + "source": [ + "async def integrate_with_falkordb(knowledge_graph):\n", + " \"\"\"Store knowledge graph in FalkorDB with enhanced features.\"\"\"\n", + " \n", + " logger.info(\"Connecting to FalkorDB...\")\n", + " \n", + " # Initialize FalkorDB storage\n", + " fdb = FalkorDBStorage(**FALKORDB_CONFIG)\n", + " \n", + " try:\n", + " # Get initial statistics\n", + " initial_stats = fdb.get_graph_stats()\n", + " print(f\"Initial graph stats: {initial_stats}\")\n", + " \n", + " # Optional: Clear existing data\n", + " clear_data = input(\"\\nClear existing graph data? (y/N): \")\n", + " if clear_data.lower() == 'y':\n", + " fdb.clear_graph()\n", + " print(\"Cleared existing graph data\")\n", + " initial_stats = {'nodes': 0, 'relationships': 0}\n", + " \n", + " # Store the knowledge graph in FalkorDB\n", + " print(\"\\nPushing knowledge graph to FalkorDB...\")\n", + " fdb.visualize_graph(knowledge_graph, parent_node_type=\"Article\")\n", + " \n", + " # Get final statistics\n", + " final_stats = fdb.get_graph_stats()\n", + " \n", + " # Calculate changes\n", + " nodes_added = final_stats['nodes'] - initial_stats['nodes']\n", + " rels_added = final_stats['relationships'] - initial_stats['relationships']\n", + " \n", + " print(f\"\\nGraph successfully stored in FalkorDB!\")\n", + " print(f\"Added: {nodes_added} nodes, {rels_added} relationships\")\n", + " print(f\"Total: {final_stats['nodes']} nodes, {final_stats['relationships']} relationships\")\n", + " \n", + " # Run sample analytics query\n", + " print(\"\\nπŸ” Running analytics query...\")\n", + " try:\n", + " sample_query = \"MATCH (n) RETURN labels(n) as entity_type, count(n) as count ORDER BY count DESC LIMIT 10\"\n", + " result = fdb.run_query(sample_query)\n", + " \n", + " print(\"Entity distribution:\")\n", + " if hasattr(result, 'result_set') and result.result_set:\n", + " for row in result.result_set:\n", + " print(f\" β€’ {row[0]}: {row[1]} entities\")\n", + " except Exception as e:\n", + " print(f\"Analytics query failed: {e}\")\n", + " \n", + " return fdb\n", + " \n", + " except Exception as e:\n", + " logger.error(f\"Failed to integrate with FalkorDB: {e}\")\n", + " raise\n", + "\n", + "# Integrate with FalkorDB\n", + "falkor_storage = await integrate_with_falkordb(kg)" + ] + }, + { + "cell_type": "markdown", + "id": "e8789d31", + "metadata": {}, + "source": [ + "## πŸš€ Advanced FalkorDB Features\n", + "\n", + "Explore FalkorDB's powerful graph analytics capabilities:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "14bad008", + "metadata": {}, + "outputs": [], + "source": [ + "# Advanced Graph Analytics with FalkorDB\n", + "print(\"FalkorDB Graph Analysis:\\n\")\n", + "\n", + "# Query 1: Find all relationship types and their frequencies\n", + "try:\n", + " query1 = \"MATCH ()-[r]->() RETURN type(r) as relationship_type, count(r) as count ORDER BY count DESC\"\n", + " result1 = falkor_storage.run_query(query1)\n", + " \n", + " print(\"Relationship types in your graph:\")\n", + " if hasattr(result1, 'result_set') and result1.result_set:\n", + " for row in result1.result_set[:5]: # Show top 5\n", + " print(f\" β€’ {row[0]}: {row[1]} instances\")\n", + "except Exception as e:\n", + " print(f\"Relationship query failed: {e}\")\n", + "\n", + "print(\"\\n\" + \"=\"*50)\n", + "\n", + "# Query 2: Find connection paths between key entities\n", + "try:\n", + " query2 = \"MATCH path = (a)-[*1..2]-(b) WHERE a.name CONTAINS 'Apple' AND b.name CONTAINS 'Tim' RETURN length(path) as path_length, count(*) as paths\"\n", + " result2 = falkor_storage.run_query(query2)\n", + " \n", + " print(\"Connection paths between Apple and Tim:\")\n", + " if hasattr(result2, 'result_set') and result2.result_set:\n", + " for row in result2.result_set:\n", + " print(f\" β€’ Path length {row[0]}: {row[1]} paths\")\n", + " else:\n", + " print(\" β€’ No direct paths found\")\n", + "except Exception as e:\n", + " print(f\"Path query failed: {e}\")\n", + "\n", + "print(\"\\n\" + \"=\"*50)\n", + "\n", + "# Query 3: Show sample nodes and their properties\n", + "try:\n", + " query3 = \"MATCH (n) RETURN n.name, labels(n), n.description LIMIT 5\"\n", + " result3 = falkor_storage.run_query(query3)\n", + " \n", + " print(\"Sample entities in the graph:\")\n", + " if hasattr(result3, 'result_set') and result3.result_set:\n", + " for row in result3.result_set:\n", + " name = row[0] if row[0] else \"Unknown\"\n", + " labels = row[1] if row[1] else \"No label\"\n", + " desc = row[2] if len(row) > 2 and row[2] else \"No description\"\n", + " print(f\" β€’ {name} ({labels}): {desc[:50]}...\")\n", + "except Exception as e:\n", + " print(f\"Entity query failed: {e}\")\n", + "\n", + "print(\"\\n\" + \"=\"*50)\n", + "print(\"Advanced analysis complete!\")" + ] + }, + { + "cell_type": "markdown", + "id": "106b9319", + "metadata": {}, + "source": [ + "## 🧹 Cleanup\n", + "\n", + "Clean up resources:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "3e598319", + "metadata": {}, + "outputs": [], + "source": [ + "# Close FalkorDB connection\n", + "try:\n", + " falkor_storage.close()\n", + " print(\"πŸ”Œ FalkorDB connection closed successfully\")\n", + "except Exception as e:\n", + " print(f\"Error closing connection: {e}\")\n", + "\n", + "print(\"\\nQuickstart completed successfully!\")\n", + "print(\"\\nNext Steps:\")\n", + "print(\" β€’ Try with your own documents\")\n", + "print(\" β€’ Experiment with different schemas\")\n", + "print(\" β€’ Explore advanced Cypher queries\")\n", + "print(\" β€’ Build dynamic knowledge graphs with temporal data\")\n", + "print(\" β€’ Scale up with larger document collections\")" + ] + }, + { + "cell_type": "markdown", + "id": "8ec9c8a2", + "metadata": {}, + "source": [ + "## πŸ“š More Examples\n", + "\n", + "**Try different document types:**\n", + "\n", + "```python\n", + "# Scientific articles\n", + "from itext2kg.models.schemas import Article\n", + "\n", + "# CV/Resume processing \n", + "from itext2kg.models.schemas import CV\n", + "\n", + "# Custom schemas for your domain\n", + "class YourCustomSchema(BaseModel):\n", + " # Define your fields here\n", + " pass\n", + "```\n", + "\n", + "**Dynamic Knowledge Graphs:**\n", + "```python\n", + "# Build evolving graphs with temporal data\n", + "kg = await itext2kg_star.build_graph(\n", + " sections=sections,\n", + " existing_knowledge_graph=previous_kg, # Incremental updates\n", + " observation_date=\"2025-08-05\"\n", + ")\n", + "```" + ] + }, + { + "cell_type": "markdown", + "id": "f047f927", + "metadata": {}, + "source": [ + "## πŸ”§ Troubleshooting\n", + "\n", + "**Common Issues:**\n", + "\n", + "1. **FalkorDB Connection Error**: Ensure FalkorDB is running on the specified host/port\n", + "2. **API Key Error**: Verify your OpenAI API key is set correctly\n", + "3. **Import Error**: Make sure all packages are installed with correct versions\n", + "4. **Extraction Failures**: The system includes fallback mechanisms for robustness\n", + "\n", + "**Resources:**\n", + "- [iText2KG Documentation](https://github.com/auvalab/itext2kg)\n", + "- [FalkorDB Documentation](https://www.falkordb.com/)\n", + "- [FalkorDB Cypher Guide](https://docs.falkordb.com/)\n", + "- [OpenAI API Documentation](https://platform.openai.com/docs)\n", + "\n", + "**Support:**\n", + "- GitHub Issues: [itext2kg/issues](https://github.com/auvalab/itext2kg/issues)\n", + "- Community: [Discussions](https://github.com/auvalab/itext2kg/discussions)\n", + "- FalkorDB Community: [FalkorDB Discord](https://discord.gg/falkordb)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/itext2kg/__init__.py b/itext2kg/__init__.py index da8dc4a..8b087ff 100644 --- a/itext2kg/__init__.py +++ b/itext2kg/__init__.py @@ -1,7 +1,9 @@ from .documents_distiller import DocumentsDistiller +from .documents_distiller import DocumentsDistiller as DocumentDistiller # Add alias for backward compatibility from .graph_integration import Neo4jStorage as GraphIntegrator +from .graph_integration import FalkorDBStorage # Add FalkorDB integration from .itext2kg import iText2KG from .itext2kg_star import iText2KG_Star from . import logging_config -__all__ = ['DocumentsDistiller', 'GraphIntegrator', 'iText2KG', 'iText2KG_Star'] \ No newline at end of file +__all__ = ['DocumentsDistiller', 'DocumentDistiller', 'GraphIntegrator', 'FalkorDBStorage', 'iText2KG', 'iText2KG_Star'] \ No newline at end of file diff --git a/itext2kg/documents_distiller/__init__.py b/itext2kg/documents_distiller/__init__.py index f61718f..c0a1455 100644 --- a/itext2kg/documents_distiller/__init__.py +++ b/itext2kg/documents_distiller/__init__.py @@ -1,6 +1,8 @@ from .documents_distiller import DocumentsDistiller +from .documents_distiller import DocumentsDistiller as DocumentDistiller # Add alias for backward compatibility from ..models.schemas import InformationRetriever, RelationshipsExtractor, Article, CV, EntitiesExtractor __all__ = ["DocumentsDistiller", + "DocumentDistiller", # Add alias to exports "DataHandler", "InformationRetriever", "RelationshipsExtractor", diff --git a/itext2kg/graph_integration/__init__.py b/itext2kg/graph_integration/__init__.py index e7170d8..dc50269 100755 --- a/itext2kg/graph_integration/__init__.py +++ b/itext2kg/graph_integration/__init__.py @@ -1,2 +1,3 @@ from .neo4j_storage import Neo4jStorage -__all__ = ["Neo4jStorage"] \ No newline at end of file +from .falkordb_storage import FalkorDBStorage +__all__ = ["Neo4jStorage", "FalkorDBStorage"] \ No newline at end of file diff --git a/itext2kg/graph_integration/falkordb_storage.py b/itext2kg/graph_integration/falkordb_storage.py new file mode 100644 index 0000000..3efc073 --- /dev/null +++ b/itext2kg/graph_integration/falkordb_storage.py @@ -0,0 +1,359 @@ +from falkordb.falkordb import FalkorDB +from falkordb import Graph as FalkorGraph +import numpy as np +from typing import List, Dict, Optional, Any +from itext2kg.models import KnowledgeGraph +from itext2kg.logging_config import get_logger + +logger = get_logger(__name__) + +class FalkorDBStorage: + """ + A class to integrate and manage graph data in a FalkorDB database using the official client. + Enhanced to match Neo4j integration functionality. + """ + def __init__(self, host: str, port: int, password: Optional[str], graph_name: str): + """ + Initializes the FalkorDBStorage with host, port, and target graph name. + + Args: + host (str): Hostname or IP of the FalkorDB server. + port (int): Port for the FalkorDB server. + password (str): Password for database access (if any). + graph_name (str): Name of the graph to write to. + """ + try: + self.client = FalkorDB(host=host, port=port, password=password) + logger.debug("Connected FalkorDB client to %s:%d", host, port) + self.graph = FalkorGraph(self.client, graph_name) + self.graph_name = graph_name + logger.info("Initialized FalkorDB storage for graph: %s", graph_name) + except Exception as e: + logger.error("Failed to connect to FalkorDB at %s:%d - %s", host, port, e) + raise + + def connect(self): + """ + Returns the existing connection (FalkorDB maintains connection internally). + + Returns: + The FalkorDB client instance. + """ + return self.client + + def run_query(self, query: str): + """ + Runs a Cypher query against the FalkorDB database. + + Args: + query (str): The Cypher query to run. + + Returns: + Query result from FalkorDB. + """ + try: + result = self.graph.query(query) + logger.debug("Executed query successfully: %s", query[:100]) + return result + except Exception as e: + logger.error("Failed to execute query '%s': %s", query[:100], e) + raise + + @staticmethod + def transform_embeddings_to_str_list(embeddings: np.ndarray) -> str: + """ + Transforms a NumPy array of embeddings into a comma-separated string. + + Args: + embeddings (np.array): An array of embeddings. + + Returns: + str: A comma-separated string of embeddings. + """ + if embeddings is None: + return "" + return ",".join(list(embeddings.astype("str"))) + + @staticmethod + def transform_str_list_to_embeddings(embeddings: str) -> np.ndarray: + """ + Transforms a comma-separated string of embeddings back into a NumPy array. + + Args: + embeddings (str): A comma-separated string of embeddings. + + Returns: + np.array: A NumPy array of embeddings. + """ + if embeddings is None or embeddings == "": + return np.array([]) + return np.array(embeddings.split(",")).astype(np.float64) + + @staticmethod + def escape_str(s: str) -> str: + """ + Escapes double quotes and backslashes in a string for safe insertion into a Cypher query. + """ + return s.replace('\\', '\\\\').replace('"', '\\"').replace("'", "\\'") + + @staticmethod + def format_value(value: Any) -> str: + """ + Converts a value to a string and escapes it for safe Cypher insertion. + """ + return FalkorDBStorage.escape_str(str(value)) + + @staticmethod + def format_property_value(key: str, value: Any) -> str: + """ + Formats a property value for safe Cypher insertion, handling different data types. + + Args: + key (str): The property key name + value: The property value to format + + Returns: + str: A formatted string for Cypher query + """ + if key == "embeddings": + return f'"{FalkorDBStorage.transform_embeddings_to_str_list(value)}"' + elif isinstance(value, list): + if not value: # Empty list + return "[]" + formatted_items = [] + for item in value: + if isinstance(item, str): + escaped_item = FalkorDBStorage.escape_str(item) + formatted_items.append(f'"{escaped_item}"') + elif isinstance(item, (int, float)): + formatted_items.append(str(item)) + else: + escaped_item = FalkorDBStorage.escape_str(str(item)) + formatted_items.append(f'"{escaped_item}"') + return f"[{', '.join(formatted_items)}]" + elif isinstance(value, (int, float)): + return str(value) + elif isinstance(value, bool): + return str(value).lower() + else: + return f'"{FalkorDBStorage.format_value(value)}"' + + def create_nodes(self, knowledge_graph: KnowledgeGraph) -> List[str]: + """ + Creates or merges nodes in FalkorDB based on a KnowledgeGraph object. + Returns list of executed queries for logging/debugging. + + Args: + knowledge_graph (KnowledgeGraph): The graph with entities defined. + + Returns: + List[str]: List of executed Cypher queries. + """ + queries = [] + for node in knowledge_graph.entities: + try: + # Build properties for MERGE + node_name = FalkorDBStorage.format_value(node.name) + node_label = node.label + + # Build SET clauses for properties + properties = [] + props = node.properties.model_dump() + + for prop, value in props.items(): + if value is None: + continue + prop_key = prop.replace(" ", "_") + if prop == "embeddings": + value_str = FalkorDBStorage.transform_embeddings_to_str_list(value) + properties.append(f'SET n.{prop_key} = "{value_str}"') + elif isinstance(value, (int, float)): + properties.append(f'SET n.{prop_key} = {value}') + elif isinstance(value, list): + formatted_value = FalkorDBStorage.format_property_value(prop, value) + properties.append(f'SET n.{prop_key} = {formatted_value}') + else: + value_str = FalkorDBStorage.format_value(value) + properties.append(f'SET n.{prop_key} = "{value_str}"') + + # Build final query + set_clause = ' '.join(properties) if properties else '' + query = f'MERGE (n:{node_label} {{name: "{node_name}"}}) {set_clause}' + + self.run_query(query) + queries.append(query) + logger.debug("Created/merged node: %s:%s", node_label, node.name) + + except Exception as e: + logger.error("Failed to create/merge node '%s': %s", node.name, e) + raise + + return queries + + def create_relationships(self, knowledge_graph: KnowledgeGraph) -> List[str]: + """ + Creates or merges relationships in FalkorDB based on a KnowledgeGraph object. + Returns list of executed queries for logging/debugging. + + Args: + knowledge_graph (KnowledgeGraph): The graph with relationships defined. + + Returns: + List[str]: List of executed Cypher queries. + """ + queries = [] + for rel in knowledge_graph.relationships: + try: + start = rel.startEntity + end = rel.endEntity + + # Format node identifiers + start_name = FalkorDBStorage.format_value(start.name) + end_name = FalkorDBStorage.format_value(end.name) + + # Build property statements + property_statements = [] + props = rel.properties.model_dump() + + for key, value in props.items(): + if value is None: + continue + formatted_value = FalkorDBStorage.format_property_value(key, value) + property_key = key.replace(" ", "_") + property_statements.append(f'r.{property_key} = {formatted_value}') + + # Build SET clause + set_clause = f'SET {", ".join(property_statements)}' if property_statements else '' + + # Create relationship with ON CREATE and ON MATCH clauses for property handling + query = ( + f'MATCH (a:{start.label} {{name: "{start_name}"}}), ' + f'(b:{end.label} {{name: "{end_name}"}}) ' + f'MERGE (a)-[r:{rel.name}]->(b) ' + f'ON CREATE {set_clause} ' + f'ON MATCH {set_clause}' + ) if set_clause else ( + f'MATCH (a:{start.label} {{name: "{start_name}"}}), ' + f'(b:{end.label} {{name: "{end_name}"}}) ' + f'MERGE (a)-[r:{rel.name}]->(b)' + ) + + self.run_query(query) + queries.append(query) + logger.debug("Created/merged relationship: %s -[%s]-> %s", + start.name, rel.name, end.name) + + except Exception as e: + logger.error("Failed to create/merge relationship '%s': %s", rel.name, e) + raise + + return queries + + def visualize_graph(self, knowledge_graph: KnowledgeGraph, parent_node_type: str = "Document") -> None: + """ + Runs the necessary queries to visualize a graph structure from a KnowledgeGraph input. + Also creates HAS_ENTITY relationships between existing nodes and knowledge graph entities. + + Args: + knowledge_graph (KnowledgeGraph): The KnowledgeGraph object containing the graph structure. + parent_node_type (str): The type of parent nodes to create HAS_ENTITY relationships with. + """ + logger.info("Visualizing graph '%s' in FalkorDB: %d nodes, %d relationships", + self.graph_name, + len(knowledge_graph.entities), + len(knowledge_graph.relationships)) + + try: + # Create nodes first + node_queries = self.create_nodes(knowledge_graph) + logger.info("Created %d nodes", len(node_queries)) + + # Then create relationships + rel_queries = self.create_relationships(knowledge_graph) + logger.info("Created %d relationships", len(rel_queries)) + + # Optionally create parent relationships + if parent_node_type: + self._create_parent_relationships(knowledge_graph, parent_node_type) + + logger.info("Graph visualization completed successfully") + + except Exception as e: + logger.error("Failed to visualize graph: %s", e) + raise + + def _create_parent_relationships(self, knowledge_graph: KnowledgeGraph, parent_node_type: str) -> None: + """ + Creates HAS_ENTITY relationships between parent nodes and entities in the knowledge graph. + + Args: + knowledge_graph (KnowledgeGraph): The knowledge graph containing entities + parent_node_type (str): The label of parent nodes to link to + """ + try: + for entity in knowledge_graph.entities: + entity_name = FalkorDBStorage.format_value(entity.name) + query = ( + f'MATCH (parent:{parent_node_type}), (entity:{entity.label} {{name: "{entity_name}"}}) ' + f'MERGE (parent)-[r:HAS_ENTITY]->(entity)' + ) + self.run_query(query) + + logger.debug("Created parent relationships with %s nodes", parent_node_type) + + except Exception as e: + logger.warning("Failed to create parent relationships: %s", e) + + def clear_graph(self) -> None: + """ + Clears all nodes and relationships from the current graph. + """ + try: + self.run_query("MATCH (n) DETACH DELETE n") + logger.info("Cleared all data from graph: %s", self.graph_name) + except Exception as e: + logger.error("Failed to clear graph: %s", e) + raise + + def get_graph_stats(self) -> Dict[str, int]: + """ + Returns basic statistics about the current graph. + + Returns: + Dict[str, int]: Dictionary containing node and relationship counts + """ + try: + # Get node count + node_result = self.run_query("MATCH (n) RETURN count(n) as node_count") + node_count = 0 + if hasattr(node_result, 'result_set') and node_result.result_set: + node_count = node_result.result_set[0][0] if len(node_result.result_set) > 0 and len(node_result.result_set[0]) > 0 else 0 + + # Get relationship count + rel_result = self.run_query("MATCH ()-[r]->() RETURN count(r) as rel_count") + rel_count = 0 + if hasattr(rel_result, 'result_set') and rel_result.result_set: + rel_count = rel_result.result_set[0][0] if len(rel_result.result_set) > 0 and len(rel_result.result_set[0]) > 0 else 0 + + stats = { + 'nodes': int(node_count) if node_count is not None else 0, + 'relationships': int(rel_count) if rel_count is not None else 0 + } + + logger.info("Graph stats: %s", stats) + return stats + + except Exception as e: + logger.error("Failed to get graph statistics: %s", e) + return {'nodes': 0, 'relationships': 0} + + def close(self) -> None: + """ + Closes the database connection. + """ + try: + if hasattr(self.client, 'close'): + self.client.close() + logger.info("Closed FalkorDB connection") + except Exception as e: + logger.error("Error closing FalkorDB connection: %s", e) diff --git a/requirements.txt b/requirements.txt index 76c62fb..b572121 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,6 +10,7 @@ langchain-openai~=0.2.0 # Database and API clients neo4j>=5.28.0,<6.0.0 +falkordb>=1.0.0 # Add FalkorDB support openai>=1.97.0,<2.0.0 # Data processing and scientific computing diff --git a/setup.cfg b/setup.cfg index a882319..a318a32 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,12 +1,12 @@ [metadata] name = itext2kg -version = 0.0.8 +version = 0.0.9 author = Auvalab - Yassir LAIRGI author_email = description = Incremental Knowledge Graphs Constructor Using Large Language Models long_description = file: README.md long_description_content_type = text/markdown -keywords = kg construction, llms, neo4j, graphs +keywords = kg construction, llms, neo4j, falkordb, graphs classifiers = Development Status :: 4 - Beta Intended Audience :: Developers @@ -28,6 +28,7 @@ install_requires = langchain-core>=0.3.69,<0.4.0 langchain-openai>=0.2.0,<0.3.0 neo4j>=5.28.0,<6.0.0 + falkordb>=1.0.0 openai>=1.97.0,<2.0.0 numpy>=1.24.0,<2.0.0 scikit-learn>=1.7.0,<2.0.0