Add MindsDB inference example and utility functions

dusvyat · dusvyat · commit bbfee91c55ea · 2024-05-16T12:52:25.000+03:00
This commit adds an example script showing how to use MindsDB for inference with external tools. It also introduces two utility functions in the 'mindsdb_sdk' package: one describes a Python function as an OpenAI tool definition, and the other obtains database table schemas. These updates aim to make it easier for developers to leverage MindsDB in their ML projects.
diff --git a/examples/using_mindsdb_llm_inference_with_tools.py b/examples/using_mindsdb_llm_inference_with_tools.py
@@ -0,0 +1,116 @@
+from openai import OpenAI, OpenAIError
+from mindsdb_sdk.utils.table_schema import get_table_schemas
+from mindsdb_sdk.utils.openai import make_openai_tool
+import mindsdb_sdk
+import os
+
+# API keys are read from the environment; os.environ.get yields None when a variable is unset.
+# Generate the MindsDB key at https://llm.mdb.ai
+MINDSDB_API_KEY = os.environ.get("MINDSDB_API_KEY")
+OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
+
+# Chat model name passed to the chat completions request below.
+MODEL = "gpt-3.5-turbo"
+# Text-to-SQL system prompt template; the {schema} placeholder is filled in by generate_system_prompt.
+SYSTEM_PROMPT = """You are a SQL expert. Given an input question, first create a syntactically correct SQL query to run, 
+then look at the results of the query and return the answer to the input question.
+Unless the user specifies in the question a specific number of examples to obtain, query for at most 5 results using the 
+LIMIT clause as per SQL standards. You can order the results to return the most informative data in the database.
+Never query for all columns from a table. You must query only the columns that are needed to answer the question. 
+Wrap each column name in backticks (`) to denote them as identifiers.
+Pay attention to use only the column names you can see in the tables below. 
+Be careful to not query for columns that do not exist. Also, pay attention to which column is in which table.
+Pay attention to use CURRENT_DATE function to get the current date, if the question involves "today".
+
+Use the following format:
+
+Question: <Question here>
+SQLQuery: <SQL Query to run>
+SQLResult: <Result of the SQLQuery>
+Answer: <Final answer here>
+
+Only use the following tables:
+
+{schema}
+"""
+# User question sent with the system prompt (another example:
+# "What is the average satisfaction of passengers in the airline_passenger_satisfaction table?")
+PROMPT = "what was the average delay on arrivals?"
+
+
+def generate_system_prompt(system_prompt, schema):
+    """
+    Build the system message for a chat completion request.
+
+    :param system_prompt: template text; its ``{schema}`` placeholder is filled via ``str.format``
+    :param schema: value substituted for the ``{schema}`` placeholder
+    :return: dictionary with ``role`` set to ``"system"`` and the rendered text as ``content``
+    """
+    rendered = system_prompt.format(schema=schema)
+    return {"role": "system", "content": rendered}
+
+
+def generate_user_prompt(query):
+    """
+    Build the user message for a chat completion request.
+
+    :param query: the user's question, used unchanged as the message content
+    :return: dictionary with ``role`` set to ``"user"`` and ``query`` as ``content``
+    """
+    return dict(role="user", content=query)
+
+
+def extract_sql_query(result):
+    """
+    Pull the SQL statement out of a model response that follows the
+    ``Question / SQLQuery / SQLResult / Answer`` layout.
+
+    The query starts after the ``SQLQuery:`` marker and may continue over the
+    following lines; the pieces are joined with single spaces. Collection stops
+    at the ``SQLResult:`` marker, or at ``Answer:`` when the model skipped the
+    result section. Markers are recognized even when the line is indented.
+
+    :param result: raw response text (``None`` or empty text is accepted)
+    :return: the query as a single-line string, or ``None`` when no query text was found
+    """
+    if not result:
+        return None
+
+    start_marker = "SQLQuery:"
+    fragments = []
+    reading_query = False
+
+    for raw_line in result.split('\n'):
+        line = raw_line.strip()
+        if line.startswith(start_marker):
+            # A repeated marker inside the section restarts collection.
+            fragments = [line[len(start_marker):].strip()]
+            reading_query = True
+        elif line.startswith("SQLResult:"):
+            break
+        elif reading_query:
+            if line.startswith("Answer:"):
+                # No SQLResult section was produced; the answer text is not SQL.
+                break
+            fragments.append(line)
+
+    # Dropping empty fragments avoids stray spaces from blank lines or from a
+    # query that begins on the line after the marker.
+    query = " ".join(fragment for fragment in fragments if fragment)
+    return query or None
+
+
+# connect() is called without arguments, so the SDK's default connection settings apply.
+con = mindsdb_sdk.connect()
+
+database = con.databases.get(name="example_db")
+# Table schema that is substituted into the system prompt's {schema} placeholder.
+schema = get_table_schemas(database, included_tables=["airline_passenger_satisfaction"])
+
+try:
+    # To send the request to MindsDB Serve instead of OpenAI, build the client as:
+    # client_mindsdb_serve = OpenAI(
+    #     api_key=MINDSDB_API_KEY,
+    #     base_url="https://llm.mdb.ai"
+    # )
+
+    client_mindsdb_serve = OpenAI(
+        api_key=OPENAI_API_KEY
+    )
+
+    chat_completion_gpt = client_mindsdb_serve.chat.completions.create(
+        messages=[
+            generate_system_prompt(SYSTEM_PROMPT, schema),
+            generate_user_prompt(PROMPT)
+        ],
+        model=MODEL
+    )
+except OpenAIError as e:
+    # Chain the original exception so its traceback and type stay visible.
+    raise OpenAIError(f"An error occurred with the MindsDB Serve API: {e}") from e
+
+response = chat_completion_gpt.choices[0].message.content
+
+# The message content can be empty; only parse it when there is text to parse.
+query = extract_sql_query(response) if response else None
+
+print(f"Generated SQL query: {query}")
+
+if query is None:
+    # Fail with a clear message instead of sending an empty query to the database.
+    raise ValueError("No SQL query could be extracted from the model response")
+
+result = database.query(query).fetch()
+print(result)
diff --git a/mindsdb_sdk/utils/openai.py b/mindsdb_sdk/utils/openai.py
@@ -0,0 +1,51 @@
+
+import inspect
+import docstring_parser
+
+
+def make_openai_tool(function: callable):
+    """
+    Make an OpenAI tool for a function
+
+    The tool name is the function's ``__name__``, the description is the short
+    description parsed from its docstring, and every parameter of the signature
+    becomes an entry under ``parameters.properties``. Parameters without a
+    default value are also listed under ``parameters.required``.
+
+    :param function: function to generate metadata for
+    :return: dictionary containing function metadata
+    """
+    signature_params = inspect.signature(function).parameters
+    docstring = docstring_parser.parse(function.__doc__)
+
+    # Index the documented descriptions by argument name; setdefault keeps the
+    # first entry when a name is documented more than once.
+    descriptions = {}
+    for doc_param in docstring.params:
+        descriptions.setdefault(doc_param.arg_name, doc_param.description)
+
+    properties = {}
+    required = []
+
+    for name, param in signature_params.items():
+        annotation = param.annotation
+
+        # convert annotation type to string
+        # NOTE(review): this yields Python type names (e.g. "str") or None, not
+        # JSON Schema type names - confirm the consumer accepts them.
+        if annotation is inspect.Parameter.empty:
+            param_type = None
+        elif inspect.isclass(annotation):
+            param_type = annotation.__name__
+        else:
+            param_type = str(annotation)
+
+        properties[name] = {
+            "type": param_type,
+            "description": descriptions.get(name, '')
+        }
+
+        # Compare with the sentinel by identity: ``==`` would call the default
+        # value's own __eq__, which is ambiguous for array-like defaults.
+        if param.default is inspect.Parameter.empty:
+            required.append(name)
+
+    return {
+        "type": "function",
+        "function": {
+            "name": function.__name__,
+            "description": docstring.short_description,
+            "parameters": {
+                "type": "object",
+                "properties": properties,
+                "required": required
+            }
+        }
+    }
+
diff --git a/mindsdb_sdk/utils/table_schema.py b/mindsdb_sdk/utils/table_schema.py
@@ -0,0 +1,40 @@
+from typing import List
+from mindsdb_sdk.databases import Databases
+
+
+def get_dataframe_schema(df):
+    """
+    Describe the columns of a DataFrame as a list of name/type records.
+
+    The frame is first passed through ``convert_dtypes()`` and the resulting
+    dtype names are reported.
+
+    :param df: object exposing ``convert_dtypes()`` and ``dtypes`` (e.g. a pandas DataFrame)
+    :return: list of ``{"name": <column>, "type": <dtype name>}`` dictionaries, one per column
+    :raises RuntimeError: if the dtype conversion fails
+    """
+    try:
+        df = df.convert_dtypes()
+    except Exception as e:
+        # A plain string cannot be raised in Python 3; wrap it in a real
+        # exception and keep the original error as the cause.
+        raise RuntimeError(f"Error converting dtypes: {e}") from e
+
+    # Convert the dtypes Series into a list of dictionaries
+    return [{"name": column, "type": dtype.name} for column, dtype in df.dtypes.items()]
+
+
+def get_table_schemas(database: Databases, included_tables: List[str] = None):
+    """
+    Collect the column schema of each table in a database.
+
+    Every selected table is fetched and its columns are described with
+    ``get_dataframe_schema``.
+
+    :param database: database object providing ``tables.list()`` and ``get_table()``
+    :param included_tables: table names to keep; when empty or ``None`` all tables are used
+    :return: dictionary mapping each table name to its list of column descriptions
+    """
+    # Resolve the table names first, applying the optional name filter.
+    selected_names = [
+        entry.name
+        for entry in database.tables.list()
+        if not included_tables or entry.name in included_tables
+    ]
+
+    return {
+        table_name: get_dataframe_schema(database.get_table(table_name).fetch())
+        for table_name in selected_names
+    }