mirror of
https://github.com/MODSetter/SurfSense.git
synced 2025-09-01 18:19:08 +00:00
- Added DOCLING as a third ETL_SERVICE option (alongside UNSTRUCTURED/LLAMACLOUD)
- Implemented the add_received_file_document_using_docling function
- Added Docling processing logic in documents_routes.py
- Enhanced chunking with configurable overlap support
- Added a comprehensive document processing service
- Supports both CPU and GPU processing with user selection

Addresses #161 - Add Docling Support as an ETL_SERVICE.
Follows the same pattern as the LlamaCloud integration (PR #123).
68 lines
No EOL
2.4 KiB
Docker
68 lines
No EOL
2.4 KiB
Docker
# Base image: slim CPython 3.12 (apt-based, so apt-get is available below).
FROM python:3.12-slim

# All later relative paths (COPY destinations, the CMD script) resolve against /app.
WORKDIR /app
|
|
|
|
# OS-level packages, one per line and sorted alphabetically for clean diffs:
#   - gcc, python3-dev: compile native Python extensions
#   - ca-certificates, curl, gnupg2, unzip, wget: TLS trust store and download tooling
#   - tesseract-ocr, tesseract-ocr-eng, libtesseract-dev, libleptonica-dev: Tesseract OCR
# The apt package lists are removed in the same layer so they never reach the image.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        ca-certificates \
        curl \
        gcc \
        gnupg2 \
        libleptonica-dev \
        libtesseract-dev \
        python3-dev \
        tesseract-ocr \
        tesseract-ocr-eng \
        unzip \
        wget \
    && rm -rf /var/lib/apt/lists/*
|
|
|
|
# Refresh the system CA bundle, then upgrade the Python-side trust tooling:
#   - certifi: CA bundle whose path is exported via SSL_CERT_FILE later in this file
#   - pip-system-certs: NOTE(review): presumably makes pip/requests use the system
#     trust store -- confirm this is still needed alongside certifi.
# --no-cache-dir keeps pip's download cache out of the layer, matching the other
# pip invocations in this file.
RUN update-ca-certificates \
    && pip install --no-cache-dir --upgrade certifi pip-system-certs
|
|
|
|
# Bring in only the dependency manifests first, so the expensive install layers
# below stay cached until one of these two files changes.
COPY pyproject.toml uv.lock ./
|
|
|
|
# CUDA 12.1 builds of PyTorch (intended for WSL2 GPU hosts), pulled from the
# PyTorch wheel index. Installed ahead of the project dependencies.
RUN pip install --no-cache-dir \
        --index-url https://download.pytorch.org/whl/cu121 \
        torch \
        torchvision \
        torchaudio
|
|
|
|
# Project dependencies: bootstrap uv with pip, then let uv install the project
# (editable) into the system interpreter rather than a virtualenv.
# NOTE(review): uv.lock is copied above, but `uv pip install` appears to resolve
# from pyproject.toml only -- confirm whether the lockfile is meant to be honoured.
RUN pip install --no-cache-dir uv \
    && uv pip install --system --no-cache-dir -e .
|
|
|
|
# Point SSL-aware tooling at certifi's CA bundle.
# Step 1: ask certifi for its bundle path and append matching exports to
#         /root/.bashrc (this only affects bash sessions that source that file).
RUN CERTIFI_PATH="$(python -c 'import certifi; print(certifi.where())')" \
    && echo "Setting SSL_CERT_FILE to $CERTIFI_PATH" \
    && echo "export SSL_CERT_FILE=$CERTIFI_PATH" >> /root/.bashrc \
    && echo "export REQUESTS_CA_BUNDLE=$CERTIFI_PATH" >> /root/.bashrc

# Step 2: set the same variables image-wide for build steps and the container.
# The path is hard-coded to the Python 3.12 site-packages of the base image, so it
# must be updated together with the FROM tag.
ENV SSL_CERT_FILE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem \
    REQUESTS_CA_BUNDLE=/usr/local/lib/python3.12/site-packages/certifi/cacert.pem
|
|
|
|
# Pre-download EasyOCR models so the container does not have to fetch them at
# runtime.
#
# Changes compared with a plain "wget + unzip per layer" approach:
#   - TLS verification stays ON (no --no-check-certificate): these archives are
#     loaded as model weights, so they must not be fetched over an
#     unauthenticated connection.
#   - Download, extraction and archive removal share one layer, so the .zip
#     files do not remain in the image.
#   - The step is still best-effort: a failed download or extraction prints a
#     warning and the build continues (the loop always exits 0).
RUN mkdir -p /root/.EasyOCR/model \
    && for asset in v1.3/english_g2.zip pre-v1.1.6/craft_mlt_25k.zip; do \
        zip_path="/root/.EasyOCR/model/${asset##*/}"; \
        if wget -nv "https://github.com/JaidedAI/EasyOCR/releases/download/${asset}" -O "${zip_path}" \
            && unzip -o "${zip_path}" -d /root/.EasyOCR/model; then \
            echo "Pre-fetched EasyOCR model ${asset##*/}"; \
        else \
            echo "WARNING: could not pre-fetch EasyOCR model ${asset##*/}; it will have to be fetched at runtime" >&2; \
        fi; \
        rm -f "${zip_path}"; \
    done
|
|
|
|
# Pre-download Docling models by instantiating a DocumentConverter at build time.
#
# The Python code is a single line on purpose: /bin/sh does not turn "\n" inside
# double quotes into a newline, so a multi-line program written with "\n" escapes
# reaches Python as a literal backslash-n and dies with a SyntaxError.
#
# Best-effort: on failure a warning is printed and the build continues.
# NOTE(review): whether constructing DocumentConverter actually downloads model
# weights depends on the installed Docling version -- confirm.
RUN python -c "from docling.document_converter import DocumentConverter; DocumentConverter()" \
    || echo "WARNING: Docling warm-up failed; continuing without pre-downloaded models" >&2
|
|
|
|
# Install Playwright and its Chromium build for web scraping.
#   - --no-cache-dir keeps pip's download cache out of the layer, matching the
#     other pip invocations in this file.
#   - `playwright install --with-deps` installs Chromium's OS libraries through
#     apt, so the apt package lists it fetches are removed in the same layer.
RUN pip install --no-cache-dir playwright \
    && playwright install --with-deps chromium \
    && rm -rf /var/lib/apt/lists/*
|
|
|
|
# Application source goes in last, so code edits do not invalidate the dependency
# and model layers above. The destination is the current WORKDIR.
COPY . ./
|
|
|
|
# Runtime environment for the application process:
#   PYTHONPATH       - makes modules under /app importable.
#   UVICORN_LOOP     - selects the stdlib asyncio loop to avoid uvloop compatibility
#                      issues. NOTE(review): presumably read by uvicorn or main.py --
#                      confirm it takes effect when started via `python main.py`.
#   TESSDATA_PREFIX  - directory holding the Tesseract language data files.
ENV PYTHONPATH=/app \
    UVICORN_LOOP=asyncio \
    TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata/
|
|
|
|
# Container entry point.
# EXPOSE only documents the port; publish it with `-p 8000:8000` or equivalent.
EXPOSE 8000

# Exec (JSON-array) form: python runs as PID 1 and receives stop signals directly.
CMD ["python", "main.py"]