Add query function

This commit is contained in:
Tyler Perkins 2023-06-12 23:42:56 -04:00
parent 084e1cd383
commit 70a12c5820
5 changed files with 149 additions and 54 deletions

23
docker-compose.yml Normal file
View File

@ -0,0 +1,23 @@
---
version: '3.4'
services:
weaviate:
command:
- --host
- 0.0.0.0
- --port
- '12345'
- --scheme
- http
image: semitechnologies/weaviate:1.19.7
ports:
- 12345:12345
restart: on-failure:0
environment:
QUERY_DEFAULTS_LIMIT: 25
AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true'
PERSISTENCE_DATA_PATH: '/var/lib/weaviate'
DEFAULT_VECTORIZER_MODULE: 'none'
ENABLE_MODULES: ''
CLUSTER_HOSTNAME: 'node1'
...

View File

@ -4,7 +4,7 @@ app:
weaviate: weaviate:
url: "http://localhost:12345" url: "http://localhost:12345"
text-field: "content" text-field: "content"
index-name: "knowledge" index-name: "Knowledge"
schema-path: "schema.json" schema-path: "schema.json"
openai: openai:
url: "http://192.168.1.104:11111" url: "http://192.168.1.104:11111"

View File

@ -22,16 +22,16 @@ The text splitter client
db = None db = None
def loadDocumentsIntoWeaviate(documentType: DocumentType,
documents: dict) -> None:
def loadDocumentIntoWeaviate(documentType: DocumentType,
document_name : str,
path: str) -> None:
""" """
Loads a document into Weaviate. Loads a document into Weaviate.
"""
global db
Expects documents to be of format {name -> path}
"""
texts = []
for name, path in documents.items():
documents = None documents = None
if documentType == DocumentType.markdown: if documentType == DocumentType.markdown:
@ -41,26 +41,38 @@ def loadDocumentIntoWeaviate(documentType: DocumentType,
raise Exception("Document type not supported.") raise Exception("Document type not supported.")
# Split up the document # Split up the document
texts = splitDocument(documents) newDocuments = splitDocuments(documents)
for doc in newDocuments:
doc.metadata["document_name"] = name
for text in texts: texts.extend(newDocuments)
text.metadata["document_name"] = document_name
if db is None: db = getDatabase()
print(config["app.weaviate.url"])
client = weaviate.Client(config["app.weaviate.url"]) logging.info(f"Parsed {len(documents)} documents")
db = WeaviateClient(client,
text_key=config["app.weaviate.text-field"],
index_name=config["app.weaviate.index-name"])
db.addDocuments(documents=texts) db.addDocuments(documents=texts)
logging.info(f"Loaded document {len(documents)} into Weaviate") logging.info(f"Loaded document {len(documents)} into Weaviate")
def loadDocumentIntoWeaviate(documentType: DocumentType,
def splitDocument(document: Document | Iterable[Document]) -> List[Document]: document_name : str,
path: str) -> None:
""" """
Splits a document into multiple documents using spaCy Loads a document into Weaviate.
"""
loadDocumentsIntoWeaviate(documentType, {document_name: path})
def findSimilarDocuments(query: str, limit : int = 10) -> List[Document]:
    """
    Look up stored documents whose content is similar to a query.

    Args:
        query: Text to search for.
        limit: Maximum number of results to return, default 10.

    Returns:
        Whatever the database client's similaritySearch returns for
        the given query and limit.
    """
    # getDatabase() hands back the client instance to search against.
    return getDatabase().similaritySearch(query=query, k=limit)
def splitDocuments(document: Iterable[Document]) -> List[Document]:
"""
Splits a document into multiple documents
""" """
if document is None: if document is None:
raise Exception("Document is None") raise Exception("Document is None")
@ -69,3 +81,17 @@ def splitDocument(document: Document | Iterable[Document]) -> List[Document]:
return textSplitter.split_documents(document) return textSplitter.split_documents(document)
def getDatabase() -> WeaviateClient:
    """
    Return the module-level WeaviateClient, creating it on first use.

    The client is built from the "app.weaviate.url",
    "app.weaviate.text-field" and "app.weaviate.index-name" config
    entries and stored in the global `db`, so later calls reuse it.
    """
    global db

    # Already built: reuse the existing instance.
    if db is not None:
        return db

    weaviateUrl = config["app.weaviate.url"]
    logging.debug(weaviateUrl)

    rawClient = weaviate.Client(weaviateUrl)
    db = WeaviateClient(
        rawClient,
        text_key=config["app.weaviate.text-field"],
        index_name=config["app.weaviate.index-name"],
    )
    return db

View File

@ -47,15 +47,13 @@ class WeaviateClient:
vector=vector vector=vector
) )
def addDocument(self, document: Document) -> None: def addDocument(self, document: Document) -> None:
""" """
Adds a given document to the store Adds a given document to the store
""" """
self.addDocuments([document]) self.addDocuments([document])
def similaritySearch(self, query: str, k: int = 10) -> List[str]: def similaritySearch(self, query: str, k: int = 10) -> List[Document]:
""" """
Searches for similar documents Searches for similar documents
@ -63,7 +61,34 @@ class WeaviateClient:
query: Text to lookup query: Text to lookup
k: Number of results to return, default 10 k: Number of results to return, default 10
""" """
pass if self.embeddingProvider is None:
raise Exception("No embedding provider set")
vector = self.embeddingProvider.getEmbedding(query)
return self.similaritySearchByVector(vector, k)
def similaritySearchByVector(self, vector: List[float], k: int = 10) -> List[Document]:
    """
    Searches for documents whose stored vector is near the given vector.

    Args:
        vector: Vector to lookup
        k: Number of results to return, default 10

    Returns:
        One Document per hit. page_content is the value stored under
        self.text_key; metadata holds the remaining requested
        properties (currently "document_name").

    Raises:
        Exception: If the query result contains an "errors" entry.
    """
    # Request the configured text property rather than a hard-coded
    # "content", so the lookup by self.text_key below always finds it.
    properties = [self.text_key, "document_name"]
    query_obj = self.client.query.get(self.index_name, properties)
    result = (
        query_obj
        .with_near_vector({"vector": vector})
        .with_limit(k)
        .do()
    )

    if "errors" in result:
        raise Exception(result["errors"])

    # Treat a null class entry as "no hits" instead of failing on iteration.
    hits = result["data"]["Get"][self.index_name] or []

    results = []
    for hit in hits:
        # Pop the text so that only the other properties remain as metadata.
        text = hit.pop(self.text_key)
        results.append(Document(page_content=text, metadata=hit))
    return results
def removeDocument(self, document_id: str) -> None: def removeDocument(self, document_id: str) -> None:
""" """

View File

@ -1,4 +1,4 @@
from fastapi import APIRouter, UploadFile from fastapi import APIRouter, UploadFile, responses
from pydantic import BaseModel from pydantic import BaseModel
from enum import Enum from enum import Enum
import tempfile import tempfile
@ -7,55 +7,63 @@ from internal.documentTypes import DocumentType
from internal import documents from internal import documents
# set logger to debug
logging.basicConfig(level=logging.DEBUG)
router = APIRouter( router = APIRouter(
prefix="/documents", prefix="/documents",
tags=["documents"], tags=["documents"],
responses={404: {"description": "Not found"}}, responses={404: {"description": "Not found"}},
) )
@router.get("/{document_type}") @router.get("/")
async def read_documents(document_type: DocumentType): async def read_documents():
""" """
Get all documents Get all documents
""" """
pass pass
@router.get("/{document_id}")
async def read_document(document_id: str):
"""
Get a specific document
"""
pass
@router.post("/") @router.post("/")
async def create_document(document_type: DocumentType, async def create_document(document_type: DocumentType,
file: UploadFile): files: list[UploadFile]):
""" """
Create a new document Create a new document
""" """
tmpFiles = []
docs = {}
for file in files:
logging.info(f"Uploaded file {file.filename}")
tmp = tempfile.NamedTemporaryFile(delete=True) tmp = tempfile.NamedTemporaryFile(delete=True)
document_id = file.filename document_id = file.filename
path = tmp.name
try:
logging.info(f"Uploaded file {file.filename} to {tmp.name}")
# Write the file to a temporary file # Write the file to a temporary file
tmp.write(await file.read()) tmp.write(await file.read())
tmp.flush() tmp.flush()
# Load the document docs[document_id] = path
documents.loadDocumentIntoWeaviate(document_type, document_id, tmp.name)
finally: tmpFiles.append(tmp)
documents.loadDocumentsIntoWeaviate(document_type, docs)
for tmp in tmpFiles:
tmp.close() tmp.close()
return {"document_id": document_id} return {
"documents": [ key for key in docs ]
}
@router.put("/{document_id}") @router.put("/{document_id}")
async def update_document(document_id: str): async def update_document(document_id: str):
""" """
Update a document Update a document
""" """
pass
@router.delete("/{document_id}") @router.delete("/{document_id}")
async def delete_document(document_id: str): async def delete_document(document_id: str):
@ -65,10 +73,23 @@ async def delete_document(document_id: str):
pass pass
@router.get("/find") @router.get("/find")
async def find_document(query: str): async def find_document(query: str, limit: int = 10):
""" """
Finds a document with content similar to given query Finds a document with content similar to given query
""" """
pass
docs = documents.findSimilarDocuments(query, limit)
response = []
# format response
for doc in docs:
response.append({
"document_id": doc.metadata["document_name"],
"content": doc.page_content
})
return response