Add query function

2023-06-12 23:42:56 -04:00 · 2023-06-12 23:42:56 -04:00 · 70a12c5820
commit 70a12c5820
parent 084e1cd383
5 changed files with 149 additions and 54 deletions
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -0,0 +1,23 @@
+---
+version: '3.4'
+services:
+  weaviate:
+    command:
+    - --host
+    - 0.0.0.0
+    - --port
+    - '12345'
+    - --scheme
+    - http
+    image: semitechnologies/weaviate:1.19.7
+    ports:
+    - 12345:12345
+    restart: on-failure:0
+    environment:
+      QUERY_DEFAULTS_LIMIT: 25
+      AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
+      PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
+      DEFAULT_VECTORIZER_MODULE: 'none'
+      ENABLE_MODULES: ''
+      CLUSTER_HOSTNAME: 'node1'
+...
--- a/src/config.yaml
+++ b/src/config.yaml
@ -4,7 +4,7 @@ app:
  weaviate:
    url: "http://localhost:12345"
    text-field: "content"
-    index-name: "knowledge"
+    index-name: "Knowledge"
    schema-path: "schema.json"
  openai:
    url: "http://192.168.1.104:11111"
--- a/src/internal/documents.py
+++ b/src/internal/documents.py
@ -22,16 +22,16 @@ The text splitter client

 db = None

-
-
-def loadDocumentIntoWeaviate(documentType: DocumentType,
-                             document_name : str,
-                             path: str) -> None:
+def loadDocumentsIntoWeaviate(documentType: DocumentType,
+                              documents: dict) -> None:
    """
    Loads a document into Weaviate.
-    """
-    global db

+    Expects documents to be a dict of the format {name -> path}
+    """
+    texts = []
+
+    for name, path in documents.items():
        documents = None

        if documentType == DocumentType.markdown:
@ -41,26 +41,38 @@ def loadDocumentIntoWeaviate(documentType: DocumentType,
            raise Exception("Document type not supported.")

        # Split up the document
-    texts = splitDocument(documents)
+        newDocuments = splitDocuments(documents)
+        for doc in newDocuments:
+            doc.metadata["document_name"] = name

-    for text in texts:
-        text.metadata["document_name"] = document_name
+        texts.extend(newDocuments)

-    if db is None:
-        print(config["app.weaviate.url"])
-        client = weaviate.Client(config["app.weaviate.url"])
-        db = WeaviateClient(client,
-                            text_key=config["app.weaviate.text-field"],
-                            index_name=config["app.weaviate.index-name"])
+    db = getDatabase()
+
+    logging.info(f"Parsed {len(documents)} documents")

    db.addDocuments(documents=texts)

    logging.info(f"Loaded document {len(documents)} into Weaviate")

-
-def splitDocument(document: Document | Iterable[Document]) -> List[Document]:
+def loadDocumentIntoWeaviate(documentType: DocumentType,
+                             document_name : str,
+                             path: str) -> None:
    """
-    Splits a document into multiple documents using spaCy
+    Loads a document into Weaviate.
+    """
+    loadDocumentsIntoWeaviate(documentType, {document_name: path})
+
+def findSimilarDocuments(query: str, limit : int = 10) -> List[Document]:
+    """
+    Finds documents similar to the query
+    """
+    db = getDatabase()
+    return db.similaritySearch(query=query, k=limit)
+
+def splitDocuments(document: Iterable[Document]) -> List[Document]:
+    """
+    Splits a document into multiple documents
    """
    if document is None:
        raise Exception("Document is None")
@ -69,3 +81,17 @@ def splitDocument(document: Document | Iterable[Document]) -> List[Document]:

    return textSplitter.split_documents(document)

+def getDatabase() -> WeaviateClient:
+    """
+    Get the shared WeaviateClient instance, creating it on first use
+    """
+    global db
+
+    if db is None:
+        logging.debug(config["app.weaviate.url"])
+        client = weaviate.Client(config["app.weaviate.url"])
+        db = WeaviateClient(client,
+                            text_key = config["app.weaviate.text-field"],
+                            index_name = config["app.weaviate.index-name"])
+
+    return db
--- a/src/internal/weaviate/weaviate.py
+++ b/src/internal/weaviate/weaviate.py
@ -47,15 +47,13 @@ class WeaviateClient:
                        vector=vector
                    )

-
-
    def addDocument(self, document: Document) -> None:
        """
        Adds a given document to the store
        """
        self.addDocuments([document])

-    def similaritySearch(self, query: str, k: int = 10) -> List[str]:
+    def similaritySearch(self, query: str, k: int = 10) -> List[Document]:
        """
        Searches for similar documents

@ -63,7 +61,34 @@ class WeaviateClient:
          query: Text to lookup
          k: Number of results to return, default 10
        """
-        pass
+        if self.embeddingProvider is None:
+            raise Exception("No embedding provider set")
+        vector = self.embeddingProvider.getEmbedding(query)
+        return self.similaritySearchByVector(vector, k)
+
+    def similaritySearchByVector(self, vector: List[float], k: int = 10) -> List[Document]:
+        """
+        Searches for documents similar to the given vector
+
+        Args:
+          vector: Vector to lookup
+          k: Number of results to return, default 10
+        """
+        vectorQuery = { "vector": vector }
+        query_obj = self.client.query.get(self.index_name, ["content", "document_name"])
+        result = query_obj.with_near_vector(vectorQuery).with_limit(k).do()
+
+        if "errors" in result:
+            raise Exception(result["errors"])
+        results = []
+
+        print(result)
+
+        for res in result["data"]["Get"][self.index_name]:
+            text = res.pop(self.text_key)
+            results.append(Document(page_content=text, metadata=res))
+
+        return results

    def removeDocument(self, document_id: str) -> None:
        """
--- a/src/routers/documents.py
+++ b/src/routers/documents.py
@ -1,4 +1,4 @@
-from fastapi import APIRouter, UploadFile
+from fastapi import APIRouter, UploadFile, responses
 from pydantic import BaseModel
 from enum import Enum
 import tempfile
@ -7,55 +7,63 @@ from internal.documentTypes import DocumentType

 from internal import documents

+# Set the root logger level to DEBUG
+logging.basicConfig(level=logging.DEBUG)
+
 router = APIRouter(
    prefix="/documents",
    tags=["documents"],
    responses={404: {"description": "Not found"}},
    )

-@router.get("/{document_type}")
-async def read_documents(document_type: DocumentType):
+@router.get("/")
+async def read_documents():
    """
    Get all documents
    """
    pass

-@router.get("/{document_id}")
-async def read_document(document_id: str):
-    """
-    Get a specific document
-    """
-    pass

@router.post("/")
 async def create_document(document_type: DocumentType,
-                          file: UploadFile):
+                          files: list[UploadFile]):
    """
    Create a new document
    """
+
+    tmpFiles = []
+    docs = {}
+
+    for file in files:
+        logging.info(f"Uploaded file {file.filename}")
+
        tmp = tempfile.NamedTemporaryFile(delete=True)
        document_id = file.filename
-
-    try:
-        logging.info(f"Uploaded file {file.filename} to {tmp.name}")
+        path = tmp.name

        # Write the file to a temporary file
        tmp.write(await file.read())
        tmp.flush()

-        # Load the document
-        documents.loadDocumentIntoWeaviate(document_type, document_id, tmp.name)
+        docs[document_id] = path

-    finally:
+        tmpFiles.append(tmp)
+
+    documents.loadDocumentsIntoWeaviate(document_type, docs)
+
+    for tmp in tmpFiles:
        tmp.close()

-    return {"document_id": document_id}
+    return {
+            "documents": [ key for key in docs ]
+            }

@router.put("/{document_id}")
 async def update_document(document_id: str):
    """
    Update a document
    """
+    pass

@router.delete("/{document_id}")
 async def delete_document(document_id: str):
@ -65,10 +73,23 @@ async def delete_document(document_id: str):
    pass

@router.get("/find")
-async def find_document(query: str):
+async def find_document(query: str, limit: int = 10):
    """
    Finds a document with content similar to given query
    """
-    pass
+
+    docs = documents.findSimilarDocuments(query, limit)
+
+    response = []
+
+    # format response
+    for doc in docs:
+        response.append({
+            "document_id": doc.metadata["document_name"],
+            "content": doc.page_content
+            })
+    return response
+
+