Add query function

2023-06-12 23:42:56 -04:00 · 2023-06-12 23:42:56 -04:00 · 70a12c5820
commit 70a12c5820
parent 084e1cd383
5 changed files with 149 additions and 54 deletions
--- a/docker-compose.yml
+++ b/docker-compose.yml
@ -0,0 +1,23 @@
 ---
 version: '3.4'
 services:
  weaviate:
    command:
    - --host
    - 0.0.0.0
    - --port
    - '12345'
    - --scheme
    - http
    image: semitechnologies/weaviate:1.19.7
    ports:
    - 12345:12345
    restart: on-failure:0
    environment:
      QUERY_DEFAULTS_LIMIT: 25
      AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
      PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
      DEFAULT_VECTORIZER_MODULE: 'none'
      ENABLE_MODULES: ''
      CLUSTER_HOSTNAME: 'node1'
 ...
--- a/src/config.yaml
+++ b/src/config.yaml
@ -4,7 +4,7 @@ app:
  weaviate:
    url: "http://localhost:12345"
    text-field: "content"
-    index-name: "knowledge"
+    index-name: "Knowledge"
    schema-path: "schema.json"
  openai:
    url: "http://192.168.1.104:11111"
--- a/src/internal/documents.py
+++ b/src/internal/documents.py
@ -22,7 +22,38 @@ The text splitter client
 db = None
 def loadDocumentsIntoWeaviate(documentType: DocumentType,
                              documents: dict) -> None:
    """
    Loads a document into Weaviate.
    Expects documents to be of formate {name -> path}
    """
    texts = []
    for name, path in documents.items():
        documents = None
        if documentType == DocumentType.markdown:
            loader = UnstructuredMarkdownLoader(path)
            documents = loader.load()
        else:
            raise Exception("Document type not supported.")
        # Split up the document
        newDocuments = splitDocuments(documents)
        for doc in newDocuments:
            doc.metadata["document_name"] = name
        texts.extend(newDocuments)
    db = getDatabase()
    logging.info(f"Parsed {len(documents)} documents")
    db.addDocuments(documents=texts)
    logging.info(f"Loaded document {len(documents)} into Weaviate")
 def loadDocumentIntoWeaviate(documentType: DocumentType,
                             document_name : str,
@ -30,37 +61,18 @@ def loadDocumentIntoWeaviate(documentType: DocumentType,
    """
    Loads a document into Weaviate.
    """
-    global db
+    loadDocumentsIntoWeaviate(documentType, {document_name: path})
-    documents = None
+def findSimilarDocuments(query: str, limit : int = 10) -> List[Document]:
    if documentType == DocumentType.markdown:
        loader = UnstructuredMarkdownLoader(path)
        documents = loader.load()
    else:
        raise Exception("Document type not supported.")
    # Split up the document
    texts = splitDocument(documents)
    for text in texts:
        text.metadata["document_name"] = document_name
    if db is None:
        print(config["app.weaviate.url"])
        client = weaviate.Client(config["app.weaviate.url"])
        db = WeaviateClient(client,
                            text_key=config["app.weaviate.text-field"],
                            index_name=config["app.weaviate.index-name"])
    db.addDocuments(documents=texts)
    logging.info(f"Loaded document {len(documents)} into Weaviate")
 def splitDocument(document: Document | Iterable[Document]) -> List[Document]:
    """
-    Splits a document into multiple documents using spaCy
+    Finds similar documents to the query
    """
    db = getDatabase()
    return db.similaritySearch(query=query, k=limit)
 def splitDocuments(document: Iterable[Document]) -> List[Document]:
    """
    Splits a document into multiple documents
    """
    if document is None:
        raise Exception("Document is None")
@ -69,3 +81,17 @@ def splitDocument(document: Document | Iterable[Document]) -> List[Document]:
    return textSplitter.split_documents(document)
 def getDatabase() -> WeaviateClient:
    """
    Get a weaviate client instance
    """
    global db
    if db is None:
        logging.debug(config["app.weaviate.url"])
        client = weaviate.Client(config["app.weaviate.url"])
        db = WeaviateClient(client,
                            text_key = config["app.weaviate.text-field"],
                            index_name = config["app.weaviate.index-name"])
    return db
--- a/src/internal/weaviate/weaviate.py
+++ b/src/internal/weaviate/weaviate.py
@ -47,15 +47,13 @@ class WeaviateClient:
                        vector=vector
                    )
    def addDocument(self, document: Document) -> None:
        """
        Adds a given document to the store
        """
        self.addDocuments([document])
-    def similaritySearch(self, query: str, k: int = 10) -> List[str]:
+    def similaritySearch(self, query: str, k: int = 10) -> List[Document]:
        """
        Searches for similar documents
@ -63,7 +61,34 @@ class WeaviateClient:
          query: Text to lookup
          k: Number of results to return, default 10
        """
-        pass
+        if self.embeddingProvider is None:
            raise Exception("No embedding provider set")
        vector = self.embeddingProvider.getEmbedding(query)
        return self.similaritySearchByVector(vector, k)
    def similaritySearchByVector(self, vector: List[float], k: int = 10) -> List[Document]:
        """
        Searches for similar documents
        Args:
          vector: Vector to lookup
          k: Number of results to return, default 10
        """
        vectorQuery = { "vector": vector }
        query_obj = self.client.query.get(self.index_name, ["content", "document_name"])
        result = query_obj.with_near_vector(vectorQuery).with_limit(k).do()
        if "errors" in result:
            raise Exception(result["errors"])
        results = []
        print(result)
        for res in result["data"]["Get"][self.index_name]:
            text = res.pop(self.text_key)
            results.append(Document(page_content=text, metadata=res))
        return results
    def removeDocument(self, document_id: str) -> None:
        """
--- a/src/routers/documents.py
+++ b/src/routers/documents.py
@ -1,4 +1,4 @@
-from fastapi import APIRouter, UploadFile
+from fastapi import APIRouter, UploadFile, responses
 from pydantic import BaseModel
 from enum import Enum
 import tempfile
@ -7,55 +7,63 @@ from internal.documentTypes import DocumentType
 from internal import documents
 # set logger to debug
 logging.basicConfig(level=logging.DEBUG)
 router = APIRouter(
    prefix="/documents",
    tags=["documents"],
    responses={404: {"description": "Not found"}},
    )
-@router.get("/{document_type}")
+@router.get("/")
-async def read_documents(document_type: DocumentType):
+async def read_documents():
    """
    Get all documents
    """
    pass
@router.get("/{document_id}")
 async def read_document(document_id: str):
    """
    Get a specific document
    """
    pass
@router.post("/")
 async def create_document(document_type: DocumentType,
-                          file: UploadFile):
+                          files: list[UploadFile]):
    """
    Create a new document
    """
    tmp = tempfile.NamedTemporaryFile(delete=True)
    document_id = file.filename
-    try:
+    tmpFiles = []
-        logging.info(f"Uploaded file {file.filename} to {tmp.name}")
+    docs = {}
    for file in files:
        logging.info(f"Uploaded file {file.filename}")
        tmp = tempfile.NamedTemporaryFile(delete=True)
        document_id = file.filename
        path = tmp.name
        # Write the file to a temporary file
        tmp.write(await file.read())
        tmp.flush()
-        # Load the document
+        docs[document_id] = path
        documents.loadDocumentIntoWeaviate(document_type, document_id, tmp.name)
-    finally:
+        tmpFiles.append(tmp)
    documents.loadDocumentsIntoWeaviate(document_type, docs)
    for tmp in tmpFiles:
        tmp.close()
-    return {"document_id": document_id}
+    return {
            "documents": [ key for key in docs ]
            }
@router.put("/{document_id}")
 async def update_document(document_id: str):
    """
    Update a document
    """
    pass
@router.delete("/{document_id}")
 async def delete_document(document_id: str):
@ -65,10 +73,23 @@ async def delete_document(document_id: str):
    pass
@router.get("/find")
-async def find_document(query: str):
+async def find_document(query: str, limit: int = 10):
    """
    Finds a document with content similar to given query
    """
-    pass
+
    docs = documents.findSimilarDocuments(query, limit)
    response = []
    # format response
    for doc in docs:
        response.append({
            "document_id": doc.metadata["document_name"],
            "content": doc.page_content
            })
    return response