mirror of
https://github.com/MODSetter/SurfSense.git
synced 2026-05-10 03:59:47 +00:00
chore: update chonkie and add a temporary fix for the Azure embeddings registry
- TODO: Raise a PR upstream so the fix ships in the next chonkie version
This commit is contained in:
parent
5addc317f0
commit
71e4860495
4 changed files with 650 additions and 284 deletions
|
|
@ -39,6 +39,19 @@ AIRTABLE_CLIENT_SECRET=your_airtable_client_secret
|
|||
AIRTABLE_REDIRECT_URI=http://localhost:8000/api/v1/auth/airtable/connector/callback
|
||||
|
||||
# Embedding Model
|
||||
# Examples:
|
||||
# # Get sentence transformers embeddings
|
||||
# embeddings = AutoEmbeddings.get_embeddings("sentence-transformers/all-MiniLM-L6-v2")
|
||||
|
||||
# # Get OpenAI embeddings
|
||||
# embeddings = AutoEmbeddings.get_embeddings("openai://text-embedding-ada-002", api_key="...")
|
||||
|
||||
# # Get Anthropic embeddings
|
||||
# embeddings = AutoEmbeddings.get_embeddings("anthropic://claude-v1", api_key="...")
|
||||
|
||||
# # Get Cohere embeddings
|
||||
# embeddings = AutoEmbeddings.get_embeddings("cohere://embed-english-light-v3.0", api_key="...")
|
||||
|
||||
EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
|
||||
|
||||
RERANKERS_MODEL_NAME=ms-marco-MiniLM-L-12-v2
|
||||
|
|
|
|||
|
|
@ -3,9 +3,27 @@ import shutil
|
|||
from pathlib import Path
|
||||
|
||||
from chonkie import AutoEmbeddings, CodeChunker, RecursiveChunker
|
||||
from chonkie.embeddings.azure_openai import AzureOpenAIEmbeddings
|
||||
from chonkie.embeddings.registry import EmbeddingsRegistry
|
||||
from dotenv import load_dotenv
|
||||
from rerankers import Reranker
|
||||
|
||||
# TODO: Temporary workaround; remove once the Azure registry fix lands in chonkie upstream
|
||||
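# Register Azure OpenAI embeddings as a provider alias, by model-name pattern, and by explicit model name (NOTE(review): the `^text-embedding-` pattern may also capture plain OpenAI model names; verify lookup order)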
|
||||
# The Azure OpenAI embeddings backend automatically infers the following arguments from their corresponding environment variables if they are not provided:
|
||||
# - `api_key` from `AZURE_OPENAI_API_KEY`
|
||||
# - `organization` from `OPENAI_ORG_ID`
|
||||
# - `project` from `OPENAI_PROJECT_ID`
|
||||
# - `azure_ad_token` from `AZURE_OPENAI_AD_TOKEN`
|
||||
# - `api_version` from `OPENAI_API_VERSION`
|
||||
# - `azure_endpoint` from `AZURE_OPENAI_ENDPOINT`
|
||||
EmbeddingsRegistry.register_provider("azure_openai", AzureOpenAIEmbeddings)
|
||||
EmbeddingsRegistry.register_pattern(r"^text-embedding-", AzureOpenAIEmbeddings)
|
||||
EmbeddingsRegistry.register_model("text-embedding-ada-002", AzureOpenAIEmbeddings)
|
||||
EmbeddingsRegistry.register_model("text-embedding-3-small", AzureOpenAIEmbeddings)
|
||||
EmbeddingsRegistry.register_model("text-embedding-3-large", AzureOpenAIEmbeddings)
|
||||
|
||||
|
||||
# Get the base directory of the project
|
||||
BASE_DIR = Path(__file__).resolve().parent.parent.parent
|
||||
|
||||
|
|
|
|||
|
|
@ -7,7 +7,6 @@ requires-python = ">=3.12"
|
|||
dependencies = [
|
||||
"alembic>=1.13.0",
|
||||
"asyncpg>=0.30.0",
|
||||
"chonkie[all]>=1.0.6",
|
||||
"discord-py>=2.5.2",
|
||||
"docling>=2.15.0",
|
||||
"fastapi>=0.115.8",
|
||||
|
|
@ -48,6 +47,7 @@ dependencies = [
|
|||
"celery[redis]>=5.5.3",
|
||||
"flower>=2.0.1",
|
||||
"redis>=5.2.1",
|
||||
"chonkie[all]>=1.4.0",
|
||||
]
|
||||
|
||||
[dependency-groups]
|
||||
|
|
|
|||
901
surfsense_backend/uv.lock
generated
901
surfsense_backend/uv.lock
generated
File diff suppressed because it is too large
Load diff
Loading…
Add table
Add a link
Reference in a new issue