chore: updated chonkie and temp fix for azure embeddings registry

- TODO: Raise a PR upstream so the fix lands in the next chonkie release
This commit is contained in:
DESKTOP-RTLN3BA\$punk 2025-10-29 14:42:05 -07:00
parent 5addc317f0
commit 71e4860495
4 changed files with 650 additions and 284 deletions

View file

@ -39,6 +39,19 @@ AIRTABLE_CLIENT_SECRET=your_airtable_client_secret
AIRTABLE_REDIRECT_URI=http://localhost:8000/api/v1/auth/airtable/connector/callback
# Embedding Model
# Examples:
# # Get sentence transformers embeddings
# embeddings = AutoEmbeddings.get_embeddings("sentence-transformers/all-MiniLM-L6-v2")
# # Get OpenAI embeddings
# embeddings = AutoEmbeddings.get_embeddings("openai://text-embedding-ada-002", api_key="...")
# # Get Anthropic embeddings
# embeddings = AutoEmbeddings.get_embeddings("anthropic://claude-v1", api_key="...")
# # Get Cohere embeddings
# embeddings = AutoEmbeddings.get_embeddings("cohere://embed-english-light-v3.0", api_key="...")
EMBEDDING_MODEL=sentence-transformers/all-MiniLM-L6-v2
RERANKERS_MODEL_NAME=ms-marco-MiniLM-L-12-v2

View file

@ -3,9 +3,27 @@ import shutil
from pathlib import Path
from chonkie import AutoEmbeddings, CodeChunker, RecursiveChunker
from chonkie.embeddings.azure_openai import AzureOpenAIEmbeddings
from chonkie.embeddings.registry import EmbeddingsRegistry
from dotenv import load_dotenv
from rerankers import Reranker
# TODO: Upstream this fix to chonkie.
# Temporary workaround: wire AzureOpenAIEmbeddings into chonkie's embeddings
# registry by hand -- as a provider alias, as a model-name pattern, and for
# each known model name.
# Arguments that are not passed explicitly are inferred from environment
# variables:
#   api_key        <- AZURE_OPENAI_API_KEY
#   organization   <- OPENAI_ORG_ID
#   project        <- OPENAI_PROJECT_ID
#   azure_ad_token <- AZURE_OPENAI_AD_TOKEN
#   api_version    <- OPENAI_API_VERSION
#   azure_endpoint <- AZURE_OPENAI_ENDPOINT
# NOTE(review): the r"^text-embedding-" pattern presumably also matches plain
# OpenAI model names -- confirm that non-Azure OpenAI setups are not rerouted.
EmbeddingsRegistry.register_provider("azure_openai", AzureOpenAIEmbeddings)
EmbeddingsRegistry.register_pattern(r"^text-embedding-", AzureOpenAIEmbeddings)
for _azure_model_name in (
    "text-embedding-ada-002",
    "text-embedding-3-small",
    "text-embedding-3-large",
):
    EmbeddingsRegistry.register_model(_azure_model_name, AzureOpenAIEmbeddings)
# Drop the loop variable so the module namespace stays as it was.
del _azure_model_name
# Base directory of the project: resolve this file's absolute path, then walk
# up three parents (i.e. the directory two levels above the one containing
# this file).
BASE_DIR = Path(__file__).resolve().parent.parent.parent

View file

@ -7,7 +7,6 @@ requires-python = ">=3.12"
dependencies = [
"alembic>=1.13.0",
"asyncpg>=0.30.0",
"chonkie[all]>=1.0.6",
"discord-py>=2.5.2",
"docling>=2.15.0",
"fastapi>=0.115.8",
@ -48,6 +47,7 @@ dependencies = [
"celery[redis]>=5.5.3",
"flower>=2.0.1",
"redis>=5.2.1",
"chonkie[all]>=1.4.0",
]
[dependency-groups]

File diff suppressed because it is too large Load diff