Merge pull request #1 from mogith-pn/clarifai-dspy-integration

mogith-pn · web-flow · commit d61e15bedcf0 · 2024-01-24T17:47:58.000+05:30
TT-3084-clarifai-dspy-integration
diff --git a/dsp/modules/__init__.py b/dsp/modules/__init__.py
@@ -6,6 +6,7 @@
 from .cohere import *
 from .sbert import *
 from .pyserini import *
+from .clarifai import *
 
 from .hf_client import HFClientTGI
 from .hf_client import Anyscale
diff --git a/dsp/modules/clarifai.py b/dsp/modules/clarifai.py
@@ -0,0 +1,94 @@
+"""Clarifai LM integration"""
+from typing import Any, Optional
+
+from dsp.modules.lm import LM
+
+try:
+    from clarifai.client.model import Model
+except ImportError as err:
+    raise ImportError("ClarifaiLLM requires `pip install clarifai`.") from err
+
+
+class ClarifaiLLM(LM):
+    """Integration to call models hosted in clarifai platform.
+
+    Args:
+        model (str, optional): Clarifai URL of the model. Defaults to the URL of the mistralai "mistral-7B-Instruct" model.
+        api_key (Optional[str], optional): CLARIFAI_PAT token. Defaults to None.
+        **kwargs: Additional arguments to pass to the API provider.
+    Example:
+        import dspy
+        dspy.configure(lm=dspy.Clarifai(model=MODEL_URL,
+                                        api_key=CLARIFAI_PAT,
+                                        inference_params={"max_tokens":100,'temperature':0.6}))
+    """
+
+    def __init__(
+        self,
+        model: str = "https://clarifai.com/mistralai/completion/models/mistral-7B-Instruct",
+        api_key: Optional[str] = None,
+        **kwargs,
+    ):
+        super().__init__(model)
+
+        self.provider = "clarifai"
+        self.pat = api_key
+        self._model = Model(url=model, pat=api_key)
+        self.kwargs = {"n": 1, **kwargs}
+        self.history: list[dict[str, Any]] = []
+        self.kwargs["temperature"] = (
+            self.kwargs["inference_params"]["temperature"]
+            if "inference_params" in self.kwargs
+            and "temperature" in self.kwargs["inference_params"]
+            else 0.0
+        )
+        self.kwargs["max_tokens"] = (
+            self.kwargs["inference_params"]["max_tokens"]
+            if "inference_params" in self.kwargs
+            and "max_tokens" in self.kwargs["inference_params"]
+            else 150
+        )
+
+    def basic_request(self, prompt, **kwargs):
+        params = (
+            self.kwargs["inference_params"] if "inference_params" in self.kwargs else {}
+        )
+        response = (
+            self._model.predict_by_bytes(
+                input_bytes=prompt.encode(encoding="utf-8"),
+                input_type="text",
+                inference_params=params,
+            )
+            .outputs[0]
+            .data.text.raw
+        )
+        kwargs = {**self.kwargs, **kwargs}
+        history = {
+            "prompt": prompt,
+            "response": response,
+            "kwargs": kwargs,
+        }
+        self.history.append(history)
+        return response
+
+    def request(self, prompt: str, **kwargs):
+        return self.basic_request(prompt, **kwargs)
+
+    def __call__(
+        self,
+        prompt: str,
+        only_completed: bool = True,
+        return_sorted: bool = False,
+        **kwargs,
+    ):
+        assert only_completed, "for now"
+        assert return_sorted is False, "for now"
+
+        n = kwargs.pop("n", 1)
+        completions = []
+
+        for i in range(n):
+            response = self.request(prompt, **kwargs)
+            completions.append(response)
+
+        return completions
diff --git a/dsp/modules/lm.py b/dsp/modules/lm.py
@@ -45,14 +45,23 @@ def inspect_history(self, n: int = 1, skip: int = 0):
             prompt = x["prompt"]
 
             if prompt != last_prompt:
-                printed.append(
-                    (
-                        prompt,
-                        x["response"].generations
-                        if provider == "cohere"
-                        else x["response"]["choices"],
+
+                if provider=="clarifai":
+                    printed.append(
+                        (
+                            prompt,
+                            x['response']
+                        ) 
+                    )
+                else:    
+                    printed.append(
+                        (
+                            prompt,
+                            x["response"].generations
+                            if provider == "cohere"
+                            else x["response"]["choices"],
+                        )
                     )
-                )
 
             last_prompt = prompt
 
@@ -71,6 +80,8 @@ def inspect_history(self, n: int = 1, skip: int = 0):
                 text = choices[0].text
             elif provider == "openai" or provider == "ollama":
                 text = ' ' + self._get_choice_text(choices[0]).strip()
+            elif provider == "clarifai":
+                text=choices
             else:
                 text = choices[0]["text"]
             self.print_green(text, end="")
diff --git a/dspy/__init__.py b/dspy/__init__.py
@@ -18,6 +18,7 @@
 OpenAI = dsp.GPT3
 ColBERTv2 = dsp.ColBERTv2
 Pyserini = dsp.PyseriniRetriever
+Clarifai = dsp.ClarifaiLLM
 
 HFClientTGI = dsp.HFClientTGI
 HFClientVLLM = HFClientVLLM
diff --git a/dspy/retrieve/clarifai_rm.py b/dspy/retrieve/clarifai_rm.py
@@ -0,0 +1,97 @@
+"""Clarifai as retriever to retrieve hits"""
+import os
+from concurrent.futures import ThreadPoolExecutor
+from typing import List, Optional, Union
+
+import requests
+
+import dspy
+from dsp.utils import dotdict
+
+try:
+    from clarifai.client.search import Search
+except ImportError as err:
+    raise ImportError(
+        "Clarifai is not installed. Install it using `pip install clarifai`"
+    ) from err
+
+
+class ClarifaiRM(dspy.Retrieve):
+    """
+    Retrieval module that uses Clarifai to return the top K relevant passages for the given query.
+    Assumes that you have ingested the source documents into a Clarifai app, where they are indexed and stored.
+
+    Args:
+        clarifai_user_id (str): Clarifai unique user_id.
+        clarfiai_app_id (str): Clarifai App ID, where the documents are stored.
+        clarifai_pat (str): Clarifai PAT key.
+        k (int): Top K documents to retrieve.
+
+    Examples:
+        TODO
+    """
+
+    def __init__(
+        self,
+        clarifai_user_id: str,
+        clarfiai_app_id: str,
+        clarifai_pat: Optional[str] = None,
+        k: int = 3,
+    ):
+        self.app_id = clarfiai_app_id
+        self.user_id = clarifai_user_id
+        self.pat = (
+            clarifai_pat if clarifai_pat is not None else os.environ["CLARIFAI_PAT"]
+        )
+        self.k = k
+        self.clarifai_search = Search(
+            user_id=self.user_id, app_id=self.app_id, top_k=k, pat=self.pat
+        )
+        super().__init__(k=k)
+
+    def retrieve_hits(self, hits):
+        header = {"Authorization": f"Key {self.pat}"}
+        request = requests.get(hits.input.data.text.url, headers=header)
+        request.encoding = request.apparent_encoding
+        requested_text = request.text
+        return requested_text
+
+    def forward(
+        self, query_or_queries: Union[str, List[str]], k: Optional[int] = None
+    ) -> dspy.Prediction:
+        """Uses clarifai-python SDK search function and retrieves top_k similar passages for given query,
+        Args:
+             query_or_queries : single query or list of queries
+             k : Top K relevant documents to return
+
+        Returns:
+             passages in format of dotdict
+
+        Examples:
+        Below is a code snippet that shows how to use Clarifai as the default retriever:
+         ```python
+         import clarifai
+         llm = dspy.Clarifai(model=MODEL_URL, api_key="YOUR CLARIFAI_PAT")
+         retriever_model = ClarifaiRM(clarifai_user_id="USER_ID", clarfiai_app_id="APP_ID", clarifai_pat="YOUR CLARIFAI_PAT")
+         dspy.settings.configure(lm=llm, rm=retriever_model)
+         ```
+        """
+        queries = (
+            [query_or_queries]
+            if isinstance(query_or_queries, str)
+            else query_or_queries
+        )
+        self.clarifai_search.top_k = k if k is not None else self.clarifai_search.top_k
+        passages = []
+        queries = [q for q in queries if q]
+
+        for query in queries:
+            search_response = self.clarifai_search.query(ranks=[{"text_raw": query}])
+
+            # Retrieve hits
+            hits = [hit for data in search_response for hit in data.hits]
+            with ThreadPoolExecutor(max_workers=10) as executor:
+                results = list(executor.map(self.retrieve_hits, hits))
+            passages.extend(dotdict({"long_text": d}) for d in results)
+
+        return passages