
Commit c456587

Add example using MindsDB for text2SQL tasks and update util functions
This commit introduces an example that uses the MindsDB and OpenAI APIs to perform text2SQL tasks. It also adds several utility-function improvements: retries for OpenAI chat completion requests, function-call execution, SQL query extraction, and pretty printing of the conversation history. Lastly, the file `using_mindsdb_llm_inference_with_tools.py` has been renamed to `using_mindsdb_inference_with_text2sql_using_tools.py` for better clarity.
1 parent bbfee91 commit c456587

File tree: 4 files changed, +282 -50 lines changed

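For orientation, the pieces this commit adds compose into a simple text2SQL loop: ask the model for SQL, extract it, run it against a MindsDB database connection, and feed the result back. Below is a minimal sketch using the new helpers from `mindsdb_sdk.utils.openai`, condensed from the full examples in the diffs that follow; the `example_db` connection, model name, and question are taken from those examples, while the abbreviated system prompt and flow are illustrative only.

import os

import mindsdb_sdk
from openai import OpenAI
from mindsdb_sdk.utils.openai import (
    chat_completion_request,    # chat.completions.create wrapped with tenacity retries
    extract_sql_query,          # pulls the SQL out of a "SQLQuery: ..." style response
    query_database,             # runs SQL on a MindsDB database connection, returns a string
    pretty_print_conversation,  # colour-coded dump of the message history
)

con = mindsdb_sdk.connect()
database = con.databases.get("example_db")

client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# In the full example a longer system prompt (including the table schema) instructs the
# model to answer in the "SQLQuery: ... SQLResult: ..." format that extract_sql_query parses.
messages = [
    {"role": "system", "content": "You are a SQL expert. Answer with 'SQLQuery: <query>'."},
    {"role": "user", "content": "what was the average delay on arrivals?"},
]

response = chat_completion_request(client=client, model="gpt-3.5-turbo", messages=messages)
sql = extract_sql_query(response.choices[0].message.content)  # may be None if the format was not followed
result = query_database(database, sql)

messages.append({"role": "user", "content": "SQLResult: " + result})
pretty_print_conversation(messages)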
Lines changed: 39 additions & 42 deletions
@@ -1,15 +1,18 @@
 from openai import OpenAI, OpenAIError
-from mindsdb_sdk.utils.table_schema import get_table_schemas
-from mindsdb_sdk.utils.openai import make_openai_tool
+from mindsdb_sdk.utils.openai import extract_sql_query, make_openai_tool, query_database
+
 import mindsdb_sdk
 import os
 
+from mindsdb_sdk.utils.table_schema import get_table_schemas
+
 # generate the key at https://llm.mdb.ai
 MINDSDB_API_KEY = os.environ.get("MINDSDB_API_KEY")
 OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
 
 MODEL = "gpt-3.5-turbo"
-# text2sql prompt here (e.g. "What is the average satisfaction of passengers in the airline_passenger_satisfaction table?")
+
+# the prompt should be a question that can be answered by the database
 SYSTEM_PROMPT = """You are a SQL expert. Given an input question, first create a syntactically correct SQL query to run,
 then look at the results of the query and return the answer to the input question.
 Unless the user specifies in the question a specific number of examples to obtain, query for at most 5 results using the
@@ -34,55 +37,28 @@
 PROMPT = "what was the average delay on arrivals?"
 
 
-def generate_system_prompt(system_prompt, schema):
+def generate_system_prompt(system_prompt: str, schema: dict) -> dict:
     prompt = {
         "role": "system",
         "content": system_prompt.format(schema=schema)
     }
     return prompt
 
 
-def generate_user_prompt(query):
+def generate_user_prompt(query: str) -> dict:
     prompt = {
         "role": "user",
         "content": query
     }
     return prompt
 
 
-def extract_sql_query(result):
-    # Split the result into lines
-    lines = result.split('\n')
-
-    # Initialize an empty string to hold the query
-    query = ""
-
-    # Initialize a flag to indicate whether we're currently reading the query
-    reading_query = False
-
-    # Iterate over the lines
-    for line in lines:
-        # If the line starts with "SQLQuery:", start reading the query
-        if line.startswith("SQLQuery:"):
-            query = line[len("SQLQuery:"):].strip()
-            reading_query = True
-        # If the line starts with "SQLResult:", stop reading the query
-        elif line.startswith("SQLResult:"):
-            break
-        # If we're currently reading the query, append the line to the query
-        elif reading_query:
-            query += " " + line.strip()
-
-    # If no line starts with "SQLQuery:", return None
-    if query == "":
-        return None
-
-    return query
-
-
 con = mindsdb_sdk.connect()
 
-database = con.databases.get(name="example_db")
+# given database name, returns schema and database object
+# using example_db from mindsdb
+
+database = con.databases.get("example_db")
 schema = get_table_schemas(database, included_tables=["airline_passenger_satisfaction"])
 
 try:
@@ -95,22 +71,43 @@ def extract_sql_query(result):
         api_key=OPENAI_API_KEY
     )
 
+    messages = [
+        generate_system_prompt(SYSTEM_PROMPT, schema),
+        generate_user_prompt(PROMPT)
+    ]
+
     chat_completion_gpt = client_mindsdb_serve.chat.completions.create(
-        messages=[
-            generate_system_prompt(SYSTEM_PROMPT, schema),
-            generate_user_prompt(PROMPT)
-        ],
+        messages=messages,
         model=MODEL
     )
 
     response = chat_completion_gpt.choices[0].message.content
 
+    # extract the SQL query from the response
     query = extract_sql_query(response)
 
     print(f"Generated SQL query: {query}")
 
 except OpenAIError as e:
     raise OpenAIError(f"An error occurred with the MindsDB Serve API: {e}")
 
-result = database.query(query).fetch()
-print(result)
+result = query_database(database, query)
+
+# format the result to be displayed in the prompt
+query_result = "SQLResult: " + str(result)
+
+# generate the user prompt with the query result, this will be used to generate the final response
+query = generate_user_prompt(query_result)
+
+# add the query to the messages list
+messages.append(query)
+
+# generate the final response
+chat_completion_gpt = client_mindsdb_serve.chat.completions.create(
+    messages=messages,
+    model=MODEL
+)
+
+response = chat_completion_gpt.choices[0].message.content
+
+print(response)
Lines changed: 71 additions & 0 deletions
@@ -0,0 +1,71 @@
+from openai import OpenAI
+
+
+from mindsdb_sdk.utils.openai import (
+    make_mindsdb_tool,
+    execute_function_call,
+    chat_completion_request,
+    pretty_print_conversation)
+
+import mindsdb_sdk
+import os
+
+from mindsdb_sdk.utils.table_schema import get_table_schemas
+
+# generate the key at https://llm.mdb.ai
+MINDSDB_API_KEY = os.environ.get("MINDSDB_API_KEY")
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+
+MODEL = "gpt-3.5-turbo"
+
+
+con = mindsdb_sdk.connect()
+
+# given database name, returns schema and database object
+# using example_db from mindsdb
+
+# client_mindsdb_serve = OpenAI(
+#     api_key=MINDSDB_API_KEY,
+#     base_url="https://llm.mdb.ai"
+# )
+
+client_mindsdb_serve = OpenAI(
+    api_key=OPENAI_API_KEY
+)
+
+database = con.databases.get("example_db")
+schema = get_table_schemas(database, included_tables=["airline_passenger_satisfaction"])
+
+tools = [make_mindsdb_tool(schema)]
+
+SYSTEM_PROMPT = """You are a SQL expert. Given an input question, Answer user questions by generating SQL queries
+against the database schema provided in tools
+Unless the user specifies in the question a specific number of examples to obtain, query for at most 5 results using the
+LIMIT clause as per SQL standards. You can order the results to return the most informative data in the database.
+Never query for all columns from a table. You must query only the columns that are needed to answer the question.
+Wrap each column name in backticks (`) to denote them as identifiers.
+Pay attention to use only the column names you can see in the tables below.
+Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table.
+Pay attention to use CURRENT_DATE function to get the current date, if the question involves "today"."""
+
+messages = [{
+    "role":"system", "content":SYSTEM_PROMPT
+}, {"role":"user", "content":"what was the average delay on arrivals?"}]
+
+chat_response = chat_completion_request(client=client_mindsdb_serve, model=MODEL, messages=messages, tools=tools, tool_choice=None)
+
+assistant_message = chat_response.choices[0].message
+
+assistant_message.content = str(assistant_message.tool_calls[0].function)
+
+messages.append({"role": assistant_message.role, "content": assistant_message.content})
+
+if assistant_message.tool_calls:
+    results = execute_function_call(message=assistant_message, database=database)
+    messages.append({
+        "role": "function", "tool_call_id": assistant_message.tool_calls[0].id,
+        "name": assistant_message.tool_calls[0].function.name,
+        "content": results
+    })
+
+pretty_print_conversation(messages)

mindsdb_sdk/utils/openai.py

Lines changed: 153 additions & 5 deletions
@@ -1,23 +1,50 @@
+import json
 
-import inspect
-import docstring_parser
+from mindsdb_sdk.databases import Database
+from tenacity import retry, wait_random_exponential, stop_after_attempt
 
 
-def make_openai_tool(function: callable):
+@retry(wait=wait_random_exponential(multiplier=1, max=40), stop=stop_after_attempt(3))
+def chat_completion_request(client, model, messages, tools=None, tool_choice=None):
+    try:
+        response = client.chat.completions.create(
+            model=model,
+            messages=messages,
+            tools=tools,
+            tool_choice=tool_choice,
+        )
+        return response
+    except Exception as e:
+        print("Unable to generate ChatCompletion response")
+        print(f"Exception: {e}")
+        return e
+
+
+def make_openai_tool(function: callable, description: str = None) -> dict:
     """
-    Make an OpenAI tool for a function
+    Make a generic OpenAI tool for a function
 
     :param function: function to generate metadata for
+    :param description: description of the function
+
     :return: dictionary containing function metadata
     """
+    # You will need to pip install docstring-parser to use this function
+
+    import inspect
+    import docstring_parser
+
     params = inspect.signature(function).parameters
     docstring = docstring_parser.parse(function.__doc__)
 
+    # Get the first line of the docstring as the function description or use the user-provided description
+    function_description = description or docstring.short_description
+
     function_dict = {
         "type":"function",
         "function":{
             "name":function.__name__,
-            "description":docstring.short_description,
+            "description":function_description,
             "parameters":{
                 "type":"object",
                 "properties":{},
@@ -49,3 +76,124 @@ def make_openai_tool(function: callable):
 
     return function_dict
 
+
+def make_mindsdb_tool(schema: dict) -> dict:
+    """
+    Make an OpenAI tool for querying a database connection in MindsDB
+
+    :param schema: database schema
+
+    :return: dictionary containing function metadata for openai tools
+    """
+    return {
+        "type":"function",
+        "function":{
+            "name":"query_database",
+            "description":"Use this function to answer user questions. Input should be a fully formed SQL query.",
+            "parameters":{
+                "type":"object",
+                "properties":{
+                    "query":{
+                        "type":"string",
+                        "description":f"""
+                            SQL query extracting info to answer the user's question.
+                            SQL should be written using this database schema:
+                            {schema}
+                            The query should be returned in plain text, not in JSON.
+                            """,
+                    }
+                },
+                "required":["query"],
+            },
+        }
+    }
+
+
+def extract_sql_query(result: str) -> str:
+    """
+    Extract the SQL query from an openai result string
+
+    :param result: OpenAI result string
+    :return: SQL query string
+    """
+    # Split the result into lines
+    lines = result.split('\n')
+
+    # Initialize an empty string to hold the query
+    query = ""
+
+    # Initialize a flag to indicate whether we're currently reading the query
+    reading_query = False
+
+    # Iterate over the lines
+    for line in lines:
+        # If the line starts with "SQLQuery:", start reading the query
+        if line.startswith("SQLQuery:"):
+            query = line[len("SQLQuery:"):].strip()
+            reading_query = True
+        # If the line starts with "SQLResult:", stop reading the query
+        elif line.startswith("SQLResult:"):
+            break
+        # If we're currently reading the query, append the line to the query
+        elif reading_query:
+            query += " " + line.strip()
+
+    # If no line starts with "SQLQuery:", return None
+    if query == "":
+        return None
+
+    return query
+
+
+def query_database(database: Database, query: str) -> str:
+    """
+    Execute a query on a database connection
+
+    :param database: mindsdb Database object
+    :param query: SQL query string
+
+    :return: query results as a string
+    """
+    try:
+        results = str(
+            database.query(query).fetch()
+        )
+    except Exception as e:
+        results = f"query failed with error: {e}"
+    return results
+
+
+def execute_function_call(message, database: Database = None) -> str:
+    """
+    Execute a function call in a message
+
+    """
+    if message.tool_calls[0].function.name == "query_database":
+        query = json.loads(message.tool_calls[0].function.arguments)["query"]
+        results = query_database(database, query)
+    else:
+        results = f"Error: function {message.tool_calls[0].function.name} does not exist"
+    return results
+
+
+def pretty_print_conversation(messages):
+    # you will need to pip install termcolor
+    from termcolor import colored
+    role_to_color = {
+        "system":"red",
+        "user":"green",
+        "assistant":"blue",
+        "function":"magenta",
+    }
+
+    for message in messages:
+        if message["role"] == "system":
+            print(colored(f"system: {message['content']}\n", role_to_color[message["role"]]))
+        elif message["role"] == "user":
+            print(colored(f"user: {message['content']}\n", role_to_color[message["role"]]))
+        elif message["role"] == "assistant" and message.get("function_call"):
+            print(colored(f"assistant: {message['function_call']}\n", role_to_color[message["role"]]))
+        elif message["role"] == "assistant" and not message.get("function_call"):
+            print(colored(f"assistant: {message['content']}\n", role_to_color[message["role"]]))
+        elif message["role"] == "function":
+            print(colored(f"function ({message['name']}): {message['content']}\n", role_to_color[message["role"]]))

0 commit comments
