Add MariaDB integration with native VECTOR(384) support

imash · imash · commit 848595305eab · 2025-10-20T17:44:07.000+05:30
- Add MariaDB database connector using official mariadb Python connector
- Add MariaDB Vector Store with native VECTOR(384) data type
- Improve DDL retrieval with vector search prioritization
- Enhance SQL extraction and prompt engineering
- Unify embedding model (all-MiniLM-L6-v2) across all vectorstores
- Add configurable model support for Google Gemini
- Update README with MariaDB documentation

Features:
- Native VECTOR(384) support in MariaDB 11.7+
- Hybrid vector-relational storage
- FULLTEXT indexing for enhanced search
- Backward compatible with existing code
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # 🧠 MindSQL
 
-MindSQL is a Python RAG (Retrieval-Augmented Generation) Library designed to streamline the interaction between users and their databases using just a few lines of code. With seamless integration for renowned databases such as PostgreSQL, MySQL, and SQLite, MindSQL also extends its capabilities to major databases like Snowflake and BigQuery by extending the `IDatabase` Interface. This library utilizes large language models (LLM) like GPT-4, Llama 2, Google Gemini, and supports knowledge bases like ChromaDB and Faiss.
+MindSQL is a Python RAG (Retrieval-Augmented Generation) Library designed to streamline the interaction between users and their databases using just a few lines of code. With seamless integration for renowned databases such as PostgreSQL, MySQL, MariaDB, and SQLite, MindSQL also extends its capabilities to major databases like Snowflake and BigQuery by extending the `IDatabase` Interface. This library utilizes large language models (LLM) like GPT-4, Llama 2, Google Gemini, and supports vector stores like ChromaDB, FAISS, Qdrant, and MariaDB Vector (with native VECTOR data type support).
 
 ![MindSQL Chart](https://github.com/Sammindinventory/MindSQL/assets/77489054/bc993117-8da9-4b4f-b217-8a33db65c342)
 
@@ -107,7 +107,4 @@ We value your feedback and strive to improve MindSQL. Here's how you can share y
 - Open an issue to provide general feedback, suggestions, or comments.
 - Be constructive and specific in your feedback to help us understand your perspective better.
 
-Thank you for your interest in contributing to our project! We appreciate your support and look forward to working with you. 🚀
-
-
-
+Thank you for your interest in contributing to our project! We appreciate your support and look forward to working with you. 🚀
diff --git a/mindsql/_helper/helper.py b/mindsql/_helper/helper.py
@@ -62,13 +62,26 @@ def log_and_return(extracted_sql: str) -> str:
         log.info(LOG_AND_RETURN_CONSTANT.format(llm_response, extracted_sql))
         return extracted_sql
 
+    # Check for SQLQuery: label (common LLM format)
+    if "SQLQuery:" in llm_response:
+        # Extract everything after SQLQuery:
+        sql_part = llm_response.split("SQLQuery:", 1)[1].strip()
+        # Remove any trailing text after the query
+        if "\n\n" in sql_part:
+            sql_part = sql_part.split("\n\n")[0].strip()
+        return log_and_return(sql_part)
+    
+    # Check for SQL in code blocks
     sql_match = re.search(r"```(sql)?\n(.+?)```", llm_response, re.DOTALL)
     if sql_match:
         return log_and_return(sql_match.group(2).replace("`", ""))
+    
+    # Check for SELECT statements
     elif has_select_and_semicolon(llm_response):
         start_sql = llm_response.find("SELECT")
         end_sql = llm_response.find(";")
         return log_and_return(llm_response[start_sql:end_sql + 1].replace("`", ""))
+    
     return llm_response
 
 
diff --git a/mindsql/_utils/constants.py b/mindsql/_utils/constants.py
@@ -18,6 +18,9 @@
 MYSQL_SHOW_DATABASE_QUERY = "SHOW DATABASES;"
 MYSQL_DB_TABLES_INFO_SCHEMA_QUERY = "SELECT table_name FROM information_schema.tables WHERE table_schema = '{}';"
 MYSQL_SHOW_CREATE_TABLE_QUERY = "SHOW CREATE TABLE `{}`;"
+MARIADB_SHOW_DATABASE_QUERY = "SHOW DATABASES;"
+MARIADB_DB_TABLES_INFO_SCHEMA_QUERY = "SELECT table_name FROM information_schema.tables WHERE table_schema = '{}';"
+MARIADB_SHOW_CREATE_TABLE_QUERY = "SHOW CREATE TABLE `{}`;"
 POSTGRESQL_SHOW_DATABASE_QUERY = "SELECT datname as DATABASE_NAME FROM pg_database WHERE datistemplate = false;"
 POSTGRESQL_DB_TABLES_INFO_SCHEMA_QUERY = "SELECT table_name FROM information_schema.tables WHERE table_schema = 'public' AND table_catalog = '{db}';"
 ERROR_DOWNLOADING_SQLITE_DB_CONSTANT = "Error downloading sqlite db: {}"
diff --git a/mindsql/_utils/prompts.py b/mindsql/_utils/prompts.py
@@ -1,6 +1,18 @@
-DEFAULT_PROMPT: str = """As a {dialect_name} expert, your task is to generate SQL queries based on user questions. Ensure that your {dialect_name} queries are syntactically correct and tailored to the user's inquiry. Retrieve at most 10 results using the LIMIT clause and order them for relevance. Avoid querying for all columns from a table. Select only the necessary columns wrapped in backticks (`). Use CURDATE() to handle 'today' queries and employ the LIKE clause for precise matches in {dialect_name}. Carefully consider column names and their respective tables to avoid querying non-existent columns. Stop after delivering the SQLQuery, avoiding follow-up questions.
-
-Follow this format:
+DEFAULT_PROMPT: str = """As a {dialect_name} expert, your task is to generate accurate SQL queries based on user questions and the provided table schemas.
+
+CRITICAL INSTRUCTIONS:
+1. Carefully analyze the table schemas provided in the DDL statements below
+2. When user asks to "show TABLE_NAME" or "display TABLE_NAME table", select from that specific table
+3. Examine which columns exist in which tables - only query columns that actually exist
+4. Select meaningful, relevant columns that answer the user's question (avoid unnecessary columns)
+5. Use backticks (`) to wrap table and column names for {dialect_name} compatibility
+6. Add LIMIT clause (maximum 10 rows) to prevent excessive results
+7. Use ORDER BY to organize results logically
+8. Use CURDATE() function for queries involving "today"
+9. Match filter values exactly as they appear in the schema (case-sensitive)
+10. Double-check your table and column names against the provided DDL before generating SQL
+
+Follow this exact format:
 Question: User's question here
 SQLQuery: Your SQL query without preamble
 
@@ -54,6 +66,5 @@
     - Ensure that the code is well-commented for readability and syntactically correct.
     """
 
-SQL_EXCEPTION_RESPONSE = """Apologies for the inconvenience! 🙏 It seems the database is currently experiencing a bit 
-of a hiccup and isn't cooperating as we'd like. 🤖"""
+SQL_EXCEPTION_RESPONSE = """Apologies for the inconvenience! It seems the database is currently experiencing a bit of a hiccup and isn't cooperating as we'd like."""
 
diff --git a/mindsql/core/mindsql_core.py b/mindsql/core/mindsql_core.py
@@ -49,7 +49,7 @@ def create_database_query(self, question: str, connection, tables: list, **kwarg
         question_sql_list = self.vectorstore.retrieve_relevant_question_sql(question, **kwargs)
         prompt = self.build_sql_prompt(question=question, connection=connection, question_sql_list=question_sql_list,
                                        tables=tables, **kwargs)
-        log.info(prompt)
+        # log.info(prompt)  # Don't show full prompt to users
         llm_response = self.llm.invoke(prompt, **kwargs)
         return _helper.helper.extract_sql(llm_response)
 
@@ -176,13 +176,28 @@ def __get_ddl_statements(self, connection: any, tables: list[str], question: str
         Returns:
             list[str]: The list of DDL statements.
         """
+        # Try vector store first (semantic search - best for finding relevant tables)
+        vector_ddls = []
+        try:
+            vector_ddls = self.vectorstore.retrieve_relevant_ddl(question, **kwargs)
+        except Exception as e:
+            log.info(f"Vector store retrieval failed: {e}")
+        
+        # If vector store returns good results, use them
+        if vector_ddls and len(vector_ddls) > 0:
+            return vector_ddls
+        
+        # Fallback: get all DDLs from database if vector store fails
         if tables and connection:
             ddl_statements = []
             for table_name in tables:
-                ddl_statements.append(self.database.get_ddl(connection=connection, table_name=table_name))
-        else:
-            ddl_statements = self.vectorstore.retrieve_relevant_ddl(question, **kwargs)
-        return ddl_statements
+                try:
+                    ddl_statements.append(self.database.get_ddl(connection=connection, table_name=table_name))
+                except Exception as e:
+                    log.info(f"Failed to get DDL for table {table_name}: {e}")
+            return ddl_statements
+        
+        return []
 
     def ask_db(self, connection, question: Union[str, None] = None, table_names: list = None, visualize: bool = False,
                **kwargs) -> dict:
diff --git a/mindsql/databases/__init__.py b/mindsql/databases/__init__.py
@@ -1,4 +1,5 @@
 from .idatabase import IDatabase
+from .mariadb import MariaDB
 from .mysql import MySql
 from .postgres import Postgres
 from .sqlite import Sqlite
diff --git a/mindsql/databases/mariadb.py b/mindsql/databases/mariadb.py
@@ -0,0 +1,187 @@
+from typing import List
+from urllib.parse import urlparse
+
+import mariadb
+import pandas as pd
+
+from .._utils import logger
+from .._utils.constants import SUCCESSFULLY_CONNECTED_TO_DB_CONSTANT, ERROR_CONNECTING_TO_DB_CONSTANT, \
+    INVALID_DB_CONNECTION_OBJECT, ERROR_WHILE_RUNNING_QUERY, MARIADB_DB_TABLES_INFO_SCHEMA_QUERY, \
+    MARIADB_SHOW_DATABASE_QUERY, MARIADB_SHOW_CREATE_TABLE_QUERY, CONNECTION_ESTABLISH_ERROR_CONSTANT
+from . import IDatabase
+
+log = logger.init_loggers("MariaDB")
+
+
+class MariaDB(IDatabase):
+    """IDatabase implementation for MariaDB built on the official ``mariadb`` connector."""
+
+    def create_connection(self, url: str, **kwargs) -> any:
+        """
+        A method to create a connection with MariaDB database.
+
+        Parameters:
+            url (str): The URL in the format mariadb://username:password@host:port/database_name.
+                Percent-encoded characters in the username and password are decoded.
+            **kwargs: Additional keyword arguments for the connection. A 'port' keyword is only
+                used when the URL carries no port; other keywords override the URL values.
+
+        Returns:
+            any: The connection object, or None if the connector reports an error.
+        """
+        from urllib.parse import unquote
+        parsed = urlparse(url)
+        try:
+            connection_params = {
+                'host': parsed.hostname,
+                'port': parsed.port or int(kwargs.get('port', 3306)),
+                # urlparse() leaves credentials percent-encoded; `and` passes None/'' through.
+                'user': parsed.username and unquote(parsed.username),
+                'password': parsed.password and unquote(parsed.password),
+                'database': parsed.path.lstrip('/') if parsed.path else None,
+                'autocommit': True,
+            }
+            # Remove None values and add any additional kwargs
+            connection_params = {k: v for k, v in connection_params.items() if v is not None}
+            connection_params.update({k: v for k, v in kwargs.items() if k not in ['port']})
+            conn = mariadb.connect(**connection_params)
+            log.info(SUCCESSFULLY_CONNECTED_TO_DB_CONSTANT.format("MariaDB"))
+            return conn
+        except mariadb.Error as e:
+            log.info(ERROR_CONNECTING_TO_DB_CONSTANT.format("MariaDB", str(e)))
+            return None
+
+    def validate_connection(self, connection: any) -> None:
+        """
+        A function that validates if the provided connection is a MariaDB connection.
+
+        Parameters:
+            connection: The connection object for accessing the database.
+
+        Raises:
+            ValueError: If the connection is None or does not expose a cursor() method.
+        """
+        if connection is None:
+            raise ValueError(CONNECTION_ESTABLISH_ERROR_CONSTANT)
+
+        # Duck-typed check: any DB-API style object exposing cursor() is accepted.
+        if not hasattr(connection, 'cursor'):
+            raise ValueError(INVALID_DB_CONNECTION_OBJECT.format("MariaDB"))
+
+    def execute_sql(self, connection, sql: str) -> pd.DataFrame:
+        """
+        A method to execute SQL on the database.
+
+        Parameters:
+            connection (any): The connection object.
+            sql (str): The SQL to be executed.
+
+        Returns:
+            pd.DataFrame: The result of the SQL query. Empty for statements without a result set
+                and when the connector reports an error.
+        """
+        try:
+            self.validate_connection(connection)
+            cursor = connection.cursor()
+            try:
+                cursor.execute(sql)
+                is_write = sql.strip().upper().startswith(('CREATE', 'INSERT', 'UPDATE', 'DELETE', 'DROP', 'ALTER'))
+                # Nothing to fetch for DDL/DML or other result-less statements (SET, USE, ...): commit only.
+                if is_write or not cursor.description:
+                    connection.commit()
+                    return pd.DataFrame()
+
+                column_names = [column[0] for column in cursor.description]
+                return pd.DataFrame(cursor.fetchall(), columns=column_names)
+            finally:
+                # Release the cursor even when execute() or fetchall() raises.
+                cursor.close()
+        except mariadb.Error as e:
+            log.info(ERROR_WHILE_RUNNING_QUERY.format(e))
+            return pd.DataFrame()
+
+    def get_databases(self, connection) -> List[str]:
+        """
+        Get a list of databases from the given connection and SQL query.
+
+        Parameters:
+            connection (object): The connection object for the database.
+
+        Returns:
+            List[str]: A list of unique database names; empty if they could not be retrieved.
+        """
+        try:
+            self.validate_connection(connection)
+            df_databases = self.execute_sql(connection=connection, sql=MARIADB_SHOW_DATABASE_QUERY)
+        except Exception as e:
+            log.info(e)
+            return []
+        # execute_sql() swallows connector errors and then returns a frame without this column.
+        if "Database" not in df_databases.columns:
+            return []
+        return df_databases["Database"].unique().tolist()
+
+    def get_table_names(self, connection, database: str) -> pd.DataFrame:
+        """
+        Retrieves the tables from the information schema for the specified database.
+
+        Parameters:
+            connection: The database connection object.
+            database (str): The name of the database.
+
+        Returns:
+            DataFrame: A pandas DataFrame containing the table names from the information schema.
+        """
+        self.validate_connection(connection)
+        # NOTE: the database name is interpolated into the SQL text, so pass trusted values only.
+        return self.execute_sql(connection, MARIADB_DB_TABLES_INFO_SCHEMA_QUERY.format(database))
+
+    def get_all_ddls(self, connection, database: str) -> pd.DataFrame:
+        """
+        Get all DDLs from the specified database using the provided connection object.
+
+        Parameters:
+            connection (any): The connection object.
+            database (str): The name of the database.
+
+        Returns:
+            pd.DataFrame: A pandas DataFrame containing the DDLs for each table in the specified database.
+        """
+        self.validate_connection(connection)
+        df_tables = self.get_table_names(connection, database)
+        rows = []
+        for _, row in df_tables.iterrows():
+            # Handle both uppercase and lowercase column names
+            table_name = row.get('TABLE_NAME') or row.get('table_name')
+            if table_name:
+                rows.append({'Table': table_name, 'DDL': self.get_ddl(connection, table_name)})
+        # Build the frame once instead of growing it through pandas' private DataFrame._append.
+        return pd.DataFrame(rows, columns=['Table', 'DDL'])
+
+    def get_ddl(self, connection: any, table_name: str, **kwargs) -> str:
+        """
+        A method to get the DDL for the table.
+
+        Parameters:
+            connection (any): The connection object.
+            table_name (str): The name of the table or view.
+            **kwargs: Additional keyword arguments.
+
+        Returns:
+            str: The DDL for the table.
+        """
+        # Double any backtick so the name cannot break out of the quoted identifier.
+        quoted_name = str(table_name).replace("`", "``")
+        ddl_df = self.execute_sql(connection, MARIADB_SHOW_CREATE_TABLE_QUERY.format(quoted_name))
+        # The table list is not filtered by type, and for a view the DDL column is "Create View".
+        ddl_column = "Create View" if "Create View" in ddl_df.columns else "Create Table"
+        return ddl_df[ddl_column].iloc[0]
+
+    def get_dialect(self) -> str:
+        """
+        A method to get the dialect of the database.
+
+        Returns:
+            str: The dialect name, currently always 'mysql'.
+        """
+        return 'mysql'
diff --git a/mindsql/llms/googlegenai.py b/mindsql/llms/googlegenai.py
@@ -22,7 +22,12 @@ def __init__(self, config=None):
             raise ValueError(GOOGLE_GEN_AI_APIKEY_ERROR)
         api_key = config.pop('api_key')
         genai.configure(api_key=api_key)
-        self.model = genai.GenerativeModel('gemini-pro', **config)
+        
+        # Get model name from config, default to gemini-1.5-flash
+        model_name = config.pop('model', 'gemini-1.5-flash')
+        # Store temperature for later use if provided
+        self.default_temperature = config.pop('temperature', 0.1)
+        self.model = genai.GenerativeModel(model_name, **config)
 
     def system_message(self, message: str) -> any:
         """
@@ -75,7 +80,7 @@ def invoke(self, prompt, **kwargs) -> str:
         if prompt is None or len(prompt) == 0:
             raise Exception("Prompt cannot be empty.")
 
-        temperature = kwargs.get("temperature", 0.1)
+        temperature = kwargs.get("temperature", self.default_temperature)
         response = self.model.generate_content(prompt,
                                                generation_config=genai.GenerationConfig(temperature=temperature))
         return response.text
diff --git a/mindsql/vectorstores/__init__.py b/mindsql/vectorstores/__init__.py
@@ -2,3 +2,4 @@
 from .chromadb import ChromaDB
 from .faiss_db import Faiss
 from .qdrant import Qdrant
+from .mariadb_vector import MariaDBVectorStore
diff --git a/mindsql/vectorstores/chromadb.py b/mindsql/vectorstores/chromadb.py
@@ -10,7 +10,7 @@
 
 from . import IVectorstore
 
-sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="WhereIsAI/UAE-Large-V1")
+sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(model_name="all-MiniLM-L6-v2")
 
 
 class ChromaDB(IVectorstore):
diff --git a/mindsql/vectorstores/faiss_db.py b/mindsql/vectorstores/faiss_db.py
@@ -9,7 +9,7 @@
 
 from . import IVectorstore
 
-sentence_transformer_ef = SentenceTransformer("WhereIsAI/UAE-Large-V1")
+sentence_transformer_ef = SentenceTransformer("all-MiniLM-L6-v2")
 
 
 class Faiss(IVectorstore):
diff --git a/mindsql/vectorstores/mariadb_vector.py b/mindsql/vectorstores/mariadb_vector.py
diff --git a/mindsql/vectorstores/qdrant.py b/mindsql/vectorstores/qdrant.py

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,5 @@`
`1`	`1`	`from .idatabase import IDatabase`
	`2`	`+from .mariadb import MariaDB`
`2`	`3`	`from .mysql import MySql`
`3`	`4`	`from .postgres import Postgres`
`4`	`5`	`from .sqlite import Sqlite`