Add "add document" endpoint

2023-06-12 22:30:39 -04:00 · 2023-06-12 22:30:39 -04:00 · 084e1cd383
commit 084e1cd383
parent f9f1982442
12 changed files with 353 additions and 32 deletions
--- a/src/config.py
+++ b/src/config.py
@@ -2,6 +2,8 @@ import envyaml
 import os
 import logging
 from logging import Formatter, StreamHandler
+import weaviate
+import json

 config = envyaml.EnvYAML(os.environ.get('CONFIG_PATH', 'config.yaml'))

@@ -35,3 +37,15 @@ def initLogger() -> None:
        logging.basicConfig(level=logging.INFO)
        logging.warning("Invalid log level. Using INFO as default")

+def initEnvironment() -> None:
+    os.environ["OPENAI_PROXY"] = config["app.openai.url"]
+    os.environ["OPENAI_API_KEY"] = config["app.openai.api_key"]
+
+def initWeviate() -> None:
+    logging.debug("Initializing Weaviate")
+    client = weaviate.Client(config["app.weaviate.url"])
+    with open(config["app.weaviate.schema-path"]) as file:
+        schema = json.load(file)
+        if not client.schema.contains(schema):
+            logging.debug("Creating Weaviate schema")
+            client.schema.create(schema)
--- a/src/config.yaml
+++ b/src/config.yaml
@@ -1,6 +1,18 @@
-weaviate:
-  url: "localhost:"
-
 app:
  log:
-    level: ""
+    level: "debug"
+  weaviate:
+    url: "http://localhost:12345"
+    text-field: "content"
+    index-name: "knowledge"
+    schema-path: "schema.json"
+  openai:
+    url: "http://192.168.1.104:11111"
+    api_key: "sk-"
+  document:
+    # How many characters should each chunk of a split document contain?
+    split_chunk_size: 500
+    # How much overlap should each chunk have with its neighbor?
+    split_chunk_overlap: 20
+    # Which model in our OpenAI API should we use for embeddings?
+    embeddings_model: "text-embedding"
--- a/src/internal/documentTypes.py
+++ b/src/internal/documentTypes.py
@@ -0,0 +1,12 @@
+from enum import Enum
+
+class DocumentType(str, Enum):
+    """
+    Enumerated type for document types that we support
+    """
+    markdown = "md"
+    #html = "html"
+    #pdf = "pdf"
+    #epub = "epub"
+    #odt = "odt"
+    #docx = "docx"
--- a/src/internal/documents.py
+++ b/src/internal/documents.py
@@ -1,3 +1,71 @@
+from langchain.document_loaders import UnstructuredMarkdownLoader
+from langchain.schema import Document
+from langchain.embeddings import OpenAIEmbeddings
+from langchain.vectorstores import Weaviate
+from .documentTypes import DocumentType
+from config import config
+import logging
+import weaviate
+from typing import Iterable, List
+from langchain.text_splitter import NLTKTextSplitter
+from internal.weaviate.weaviate import WeaviateClient

-from langchain.text_splitter import SpacyTextSplitter
+# Globals
+# =======
+# The text splitter
+
+textSplitter = NLTKTextSplitter(chunk_size=config["app.document.split_chunk_size"],
+                 chunk_overlap=config["app.document.split_chunk_overlap"])
+"""
+The text splitter client
+"""
+
+db = None
+
+
+
+def loadDocumentIntoWeaviate(documentType: DocumentType,
+                             document_name : str,
+                             path: str) -> None:
+    """
+    Loads a document into Weaviate.
+    """
+    global db
+
+    documents = None
+
+    if documentType == DocumentType.markdown:
+        loader = UnstructuredMarkdownLoader(path)
+        documents = loader.load()
+    else:
+        raise Exception("Document type not supported.")
+
+    # Split up the document
+    texts = splitDocument(documents)
+
+    for text in texts:
+        text.metadata["document_name"] = document_name
+
+    if db is None:
+        print(config["app.weaviate.url"])
+        client = weaviate.Client(config["app.weaviate.url"])
+        db = WeaviateClient(client,
+                            text_key=config["app.weaviate.text-field"],
+                            index_name=config["app.weaviate.index-name"])
+
+    db.addDocuments(documents=texts)
+
+    logging.info(f"Loaded document {len(documents)} into Weaviate")
+
+
+def splitDocument(document: Document | Iterable[Document]) -> List[Document]:
+    """
+    Splits a document into multiple documents using the NLTK text splitter
+    """
+    if document is None:
+        raise Exception("Document is None")
+
+    global textSplitter
+
+    return textSplitter.split_documents(document)

--- a/src/internal/embeddings/EmbeddingProvider.py
+++ b/src/internal/embeddings/EmbeddingProvider.py
@@ -0,0 +1,11 @@
+from abc import ABC, abstractmethod
+from typing import List
+
+class EmbeddingProvider(ABC):
+    @abstractmethod
+    def getEmbedding(self, word: str) -> List[float]:
+        """
+        Returns the embedding for the given word
+        """
+        pass
+
--- a/src/internal/embeddings/OpenAIEmbeddingProvider.py
+++ b/src/internal/embeddings/OpenAIEmbeddingProvider.py
@@ -0,0 +1,48 @@
+from .EmbeddingProvider import EmbeddingProvider
+from typing import List
+import openai
+from config import config
+
+class OpenAIEmbeddingProvider(EmbeddingProvider):
+
+    def __init__(self):
+        super().__init__()
+        self.openai_api_key = config["app.openai.api_key"]
+        self.openai_url = config["app.openai.url"]
+        self.model = config["app.document.embeddings_model"]
+
+    def getEmbedding(self, text: str) -> List[float]:
+        """
+        Returns the embedding for the given string
+        """
+        openai.api_key = self.openai_api_key
+        openai.api_base = self.openai_url
+        return openai.Embedding.create(input = [text],
+                                model=self.model)['data'][0]['embedding']
+
+    @property
+    def OPENAI_API_KEY(self):
+        return self.openai_api_key
+
+    @OPENAI_API_KEY.setter
+    def OPENAI_API_KEY(self, value):
+        self.openai_api_key = value
+
+    @property
+    def OPENAI_URL(self):
+        return self.openai_url
+
+    @OPENAI_URL.setter
+    def OPENAI_URL(self, value):
+        self.openai_url = value
+
+    @property
+    def MODEL(self):
+        return self.model
+
+    @MODEL.setter
+    def MODEL(self, value):
+        self.model = value
+
+
+
--- a/src/internal/embeddings/init.py
+++ b/src/internal/embeddings/init.py
--- a/src/internal/weaviate/weaviate.py
+++ b/src/internal/weaviate/weaviate.py
@@ -0,0 +1,121 @@
+import weaviate
+import json
+import logging
+from typing import Iterable, List
+from internal.embeddings.OpenAIEmbeddingProvider import OpenAIEmbeddingProvider
+from internal.embeddings.EmbeddingProvider import EmbeddingProvider
+from langchain.schema import Document
+
+class WeaviateClient:
+    # Constructor
+    def __init__(self, client: weaviate.Client,
+                 text_key: str = "content",
+                 index_name: str = "documents",
+                 embeddingProvider: EmbeddingProvider = OpenAIEmbeddingProvider(),
+                 ):
+        self.client = client
+        self.embeddingProvider = embeddingProvider
+        self.text_key = text_key
+        self.index_name = index_name
+
+    @classmethod
+    def fromUrl(cls, endpoint: str):
+        """
+        Creates a WeaviateClient from an endpoint
+        """
+        client = weaviate.Client(endpoint)
+        return cls(client)
+
+    def addDocuments(self, documents: Iterable[Document]) -> None:
+        """
+        Adds a list of documents to the store
+        """
+        if self.embeddingProvider is None:
+            raise Exception("No embedding provider set")
+
+        with self.client.batch as batch:
+            for i, text in enumerate(documents):
+                data_properties = {self.text_key: text.page_content}
+                if text.metadata is not None:
+                    for key, val in text.metadata.items():
+                        data_properties[key] = val
+
+                vector = self.embeddingProvider.getEmbedding(text.page_content)
+                batch.add_data_object(
+                        data_object=data_properties,
+                        class_name=self.index_name,
+                        vector=vector
+                    )
+
+
+
+    def addDocument(self, document: Document) -> None:
+        """
+        Adds a given document to the store
+        """
+        self.addDocuments([document])
+
+    def similaritySearch(self, query: str, k: int = 10) -> List[str]:
+        """
+        Searches for similar documents
+
+        Args:
+          query: Text to lookup
+          k: Number of results to return, default 10
+        """
+        pass
+
+    def removeDocument(self, document_id: str) -> None:
+        """
+        Removes a document from the store
+
+        Args:
+          document_id: ID of the document to remove
+        """
+        pass
+
+    def getDocument(self, document_id: str) -> str:
+        """
+        Returns a document from the store
+
+        Args:
+          document_id: ID of the document to return
+        """
+        pass
+
+
+    @property
+    def ENDPOINT(self):
+        return self.endpoint
+
+    @ENDPOINT.setter
+    def ENDPOINT(self, endpoint: str):
+        self.endpoint = endpoint
+
+    @property
+    def EMBEDDING_PROVIDER(self):
+        return self.embeddingProvider
+
+    @EMBEDDING_PROVIDER.setter
+    def EMBEDDING_PROVIDER(self, embeddingProvider: EmbeddingProvider):
+        self.embeddingProvider = embeddingProvider
+
+    @property
+    def TEXT_KEY(self):
+        return self.text_key
+
+    @TEXT_KEY.setter
+    def TEXT_KEY(self, text_key: str):
+        self.text_key = text_key
+
+    @property
+    def INDEX_NAME(self):
+        return self.index_name
+
+    @INDEX_NAME.setter
+    def INDEX_NAME(self, index_name: str):
+        self.index_name = index_name
+
+
+
+
--- a/src/main.py
+++ b/src/main.py
@@ -4,18 +4,19 @@ from fastapi.responses import RedirectResponse
 from routers import question, documents

 import logging
-from config import config, initLogger
+from config import config, initLogger, initEnvironment, initWeviate

+initEnvironment()
 initLogger()

+# Initialize the Weaviate schema if it does not exist yet
+initWeviate()
+
 app = FastAPI()

 app.include_router(question.router)
 app.include_router(documents.router)

-logging.warn("Test message")
-
-
 @app.get("/")
 async def root():
    """
--- a/src/routers/documents.py
+++ b/src/routers/documents.py
@@ -1,6 +1,11 @@
-from fastapi import APIRouter
+from fastapi import APIRouter, UploadFile
 from pydantic import BaseModel
 from enum import Enum
+import tempfile
+import logging
+from internal.documentTypes import DocumentType
+
+from internal import documents

 router = APIRouter(
    prefix="/documents",
@@ -8,21 +13,6 @@ router = APIRouter(
    responses={404: {"description": "Not found"}},
    )

-class Document(BaseModel):
-    id: int
-    title: str
-    content: str
-
-# Document Type enum
-class DocumentType(str, Enum):
-    markdown = "md"
-    #html = "html"
-    #pdf = "pdf"
-    epub = "epub"
-    odt = "odt"
-    docx = "docx"
-
-
 @router.get("/{document_type}")
 async def read_documents(document_type: DocumentType):
    """
@@ -31,30 +21,54 @@ async def read_documents(document_type: DocumentType):
    pass

 @router.get("/{document_id}")
-async def read_document(document_id: int):
+async def read_document(document_id: str):
    """
    Get a specific document
    """
    pass

 @router.post("/")
-async def create_document(document: Document):
+async def create_document(document_type: DocumentType,
+                          file: UploadFile):
    """
    Create a new document
    """
-    pass
+    tmp = tempfile.NamedTemporaryFile(delete=True)
+    document_id = file.filename
+
+    try:
+        logging.info(f"Uploaded file {file.filename} to {tmp.name}")
+
+        # Write the file to a temporary file
+        tmp.write(await file.read())
+        tmp.flush()
+
+        # Load the document
+        documents.loadDocumentIntoWeaviate(document_type, document_id, tmp.name)
+
+    finally:
+        tmp.close()
+
+    return {"document_id": document_id}

 @router.put("/{document_id}")
-async def update_document(document_id: int, document: Document):
+async def update_document(document_id: str):
    """
    Update a document
    """
-    pass

 @router.delete("/{document_id}")
-async def delete_document(document_id: int):
+async def delete_document(document_id: str):
    """
    Delete a document
    """
    pass

+@router.get("/find")
+async def find_document(query: str):
+    """
+    Finds a document with content similar to given query
+    """
+    pass
+
+
--- a/src/routers/question.py
+++ b/src/routers/question.py
@@ -22,4 +22,3 @@ async def ask_question(question: Question, conversation_id: str):
 @router.get("/{conversation_id}}")
 async def get_question_history(conversation_id: str):
    return {"message": f"Hello question {conversation_id}!"}
-
--- a/src/schema.json
+++ b/src/schema.json
@@ -0,0 +1,21 @@
+{
+    "classes": [
+        {
+            "class": "knowledge",
+            "description": "Knowledge for the language models",
+            "vectorizer": "none",
+            "properties": [
+                {
+                    "name": "content",
+                    "description": "The content of the document",
+                    "dataType": ["text"]
+                },
+                {
+                    "name": "document_id",
+                    "description": "The id of the document, user facing",
+                    "dataType": ["text"]
+                }
+            ]
+        }
+    ]
+}