mirror of
https://github.com/MODSetter/SurfSense.git
synced 2025-09-04 11:39:19 +00:00
Refactor: Cleanup DOCLING PR
This commit is contained in:
parent
641f784f77
commit
621590c049
5 changed files with 2331 additions and 2326 deletions
2
.gitignore
vendored
2
.gitignore
vendored
|
@ -1,5 +1,3 @@
|
||||||
.flashrank_cache*
|
.flashrank_cache*
|
||||||
podcasts/
|
podcasts/
|
||||||
reports/
|
|
||||||
SURFSENSE_CRITICAL_FIXES_REPORT.md
|
|
||||||
.env
|
.env
|
||||||
|
|
|
@ -11,10 +11,6 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
wget \
|
wget \
|
||||||
unzip \
|
unzip \
|
||||||
gnupg2 \
|
gnupg2 \
|
||||||
tesseract-ocr \
|
|
||||||
tesseract-ocr-eng \
|
|
||||||
libtesseract-dev \
|
|
||||||
libleptonica-dev \
|
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
# Update certificates and install SSL tools
|
# Update certificates and install SSL tools
|
||||||
|
@ -60,9 +56,6 @@ COPY . .
|
||||||
ENV PYTHONPATH=/app
|
ENV PYTHONPATH=/app
|
||||||
ENV UVICORN_LOOP=asyncio
|
ENV UVICORN_LOOP=asyncio
|
||||||
|
|
||||||
# Set Tesseract data path
|
|
||||||
ENV TESSDATA_PREFIX=/usr/share/tesseract-ocr/5/tessdata/
|
|
||||||
|
|
||||||
# Run
|
# Run
|
||||||
EXPOSE 8000
|
EXPOSE 8000
|
||||||
CMD ["python", "main.py"]
|
CMD ["python", "main.py"]
|
|
@ -30,7 +30,6 @@ dependencies = [
|
||||||
"slack-sdk>=3.34.0",
|
"slack-sdk>=3.34.0",
|
||||||
"static-ffmpeg>=2.13",
|
"static-ffmpeg>=2.13",
|
||||||
"tavily-python>=0.3.2",
|
"tavily-python>=0.3.2",
|
||||||
"tesserocr>=2.8.0",
|
|
||||||
"unstructured-client>=0.30.0",
|
"unstructured-client>=0.30.0",
|
||||||
"unstructured[all-docs]>=0.16.25",
|
"unstructured[all-docs]>=0.16.25",
|
||||||
"uvicorn[standard]>=0.34.0",
|
"uvicorn[standard]>=0.34.0",
|
||||||
|
|
4480
surfsense_backend/uv.lock
generated
4480
surfsense_backend/uv.lock
generated
File diff suppressed because it is too large
Load diff
|
@ -53,83 +53,98 @@ export default function FileUploader() {
|
||||||
};
|
};
|
||||||
|
|
||||||
// Conditionally set accepted file types based on ETL service
|
// Conditionally set accepted file types based on ETL service
|
||||||
const acceptedFileTypes = process.env.NEXT_PUBLIC_ETL_SERVICE === 'LLAMACLOUD'
|
const getAcceptedFileTypes = () => {
|
||||||
? {
|
const etlService = process.env.NEXT_PUBLIC_ETL_SERVICE;
|
||||||
// LlamaCloud supported file types
|
|
||||||
'application/pdf': ['.pdf'],
|
if (etlService === 'LLAMACLOUD') {
|
||||||
'application/msword': ['.doc'],
|
return {
|
||||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],
|
// LlamaCloud supported file types
|
||||||
'application/vnd.ms-word.document.macroEnabled.12': ['.docm'],
|
'application/pdf': ['.pdf'],
|
||||||
'application/msword-template': ['.dot'],
|
'application/msword': ['.doc'],
|
||||||
'application/vnd.ms-word.template.macroEnabled.12': ['.dotm'],
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],
|
||||||
'application/vnd.ms-powerpoint': ['.ppt'],
|
'application/vnd.ms-word.document.macroEnabled.12': ['.docm'],
|
||||||
'application/vnd.ms-powerpoint.template.macroEnabled.12': ['.pptm'],
|
'application/msword-template': ['.dot'],
|
||||||
'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'],
|
'application/vnd.ms-word.template.macroEnabled.12': ['.dotm'],
|
||||||
'application/vnd.ms-powerpoint.template': ['.pot'],
|
'application/vnd.ms-powerpoint': ['.ppt'],
|
||||||
'application/vnd.openxmlformats-officedocument.presentationml.template': ['.potx'],
|
'application/vnd.ms-powerpoint.template.macroEnabled.12': ['.pptm'],
|
||||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
|
'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'],
|
||||||
'application/vnd.ms-excel': ['.xls'],
|
'application/vnd.ms-powerpoint.template': ['.pot'],
|
||||||
'application/vnd.ms-excel.sheet.macroEnabled.12': ['.xlsm'],
|
'application/vnd.openxmlformats-officedocument.presentationml.template': ['.potx'],
|
||||||
'application/vnd.ms-excel.sheet.binary.macroEnabled.12': ['.xlsb'],
|
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
|
||||||
'application/vnd.ms-excel.workspace': ['.xlw'],
|
'application/vnd.ms-excel': ['.xls'],
|
||||||
'application/rtf': ['.rtf'],
|
'application/vnd.ms-excel.sheet.macroEnabled.12': ['.xlsm'],
|
||||||
'application/xml': ['.xml'],
|
'application/vnd.ms-excel.sheet.binary.macroEnabled.12': ['.xlsb'],
|
||||||
'application/epub+zip': ['.epub'],
|
'application/vnd.ms-excel.workspace': ['.xlw'],
|
||||||
'application/vnd.apple.keynote': ['.key'],
|
'application/rtf': ['.rtf'],
|
||||||
'application/vnd.apple.pages': ['.pages'],
|
'application/xml': ['.xml'],
|
||||||
'application/vnd.apple.numbers': ['.numbers'],
|
'application/epub+zip': ['.epub'],
|
||||||
'application/vnd.wordperfect': ['.wpd'],
|
'application/vnd.apple.keynote': ['.key'],
|
||||||
'application/vnd.oasis.opendocument.text': ['.odt'],
|
'application/vnd.apple.pages': ['.pages'],
|
||||||
'application/vnd.oasis.opendocument.presentation': ['.odp'],
|
'application/vnd.apple.numbers': ['.numbers'],
|
||||||
'application/vnd.oasis.opendocument.graphics': ['.odg'],
|
'application/vnd.wordperfect': ['.wpd'],
|
||||||
'application/vnd.oasis.opendocument.spreadsheet': ['.ods'],
|
'application/vnd.oasis.opendocument.text': ['.odt'],
|
||||||
'application/vnd.oasis.opendocument.formula': ['.fods'],
|
'application/vnd.oasis.opendocument.presentation': ['.odp'],
|
||||||
'text/csv': ['.csv'],
|
'application/vnd.oasis.opendocument.graphics': ['.odg'],
|
||||||
'text/tab-separated-values': ['.tsv'],
|
'application/vnd.oasis.opendocument.spreadsheet': ['.ods'],
|
||||||
'text/html': ['.html', '.htm', '.web'],
|
'application/vnd.oasis.opendocument.formula': ['.fods'],
|
||||||
'image/jpeg': ['.jpg', '.jpeg'],
|
'text/csv': ['.csv'],
|
||||||
'image/png': ['.png'],
|
'text/tab-separated-values': ['.tsv'],
|
||||||
'image/gif': ['.gif'],
|
'text/html': ['.html', '.htm', '.web'],
|
||||||
'image/bmp': ['.bmp'],
|
'image/jpeg': ['.jpg', '.jpeg'],
|
||||||
'image/svg+xml': ['.svg'],
|
'image/png': ['.png'],
|
||||||
'image/tiff': ['.tiff'],
|
'image/gif': ['.gif'],
|
||||||
'image/webp': ['.webp'],
|
'image/bmp': ['.bmp'],
|
||||||
'application/dbase': ['.dbf'],
|
'image/svg+xml': ['.svg'],
|
||||||
'application/vnd.lotus-1-2-3': ['.123'],
|
'image/tiff': ['.tiff'],
|
||||||
'text/x-web-markdown': ['.602', '.abw', '.cgm', '.cwk', '.hwp', '.lwp', '.mw', '.mcw', '.pbd', '.sda', '.sdd', '.sdp', '.sdw', '.sgl', '.sti', '.sxi', '.sxw', '.stw', '.sxg', '.uof', '.uop', '.uot', '.vor', '.wps', '.zabw'],
|
'image/webp': ['.webp'],
|
||||||
'text/x-spreadsheet': ['.dif', '.sylk', '.slk', '.prn', '.et', '.uos1', '.uos2', '.wk1', '.wk2', '.wk3', '.wk4', '.wks', '.wq1', '.wq2', '.wb1', '.wb2', '.wb3', '.qpw', '.xlr', '.eth'],
|
'application/dbase': ['.dbf'],
|
||||||
// Audio files (always supported)
|
'application/vnd.lotus-1-2-3': ['.123'],
|
||||||
...audioFileTypes,
|
'text/x-web-markdown': ['.602', '.abw', '.cgm', '.cwk', '.hwp', '.lwp', '.mw', '.mcw', '.pbd', '.sda', '.sdd', '.sdp', '.sdw', '.sgl', '.sti', '.sxi', '.sxw', '.stw', '.sxg', '.uof', '.uop', '.uot', '.vor', '.wps', '.zabw'],
|
||||||
|
'text/x-spreadsheet': ['.dif', '.sylk', '.slk', '.prn', '.et', '.uos1', '.uos2', '.wk1', '.wk2', '.wk3', '.wk4', '.wks', '.wq1', '.wq2', '.wb1', '.wb2', '.wb3', '.qpw', '.xlr', '.eth'],
|
||||||
|
// Audio files (always supported)
|
||||||
|
...audioFileTypes,
|
||||||
|
};
|
||||||
|
} else if (etlService === 'DOCLING') {
|
||||||
|
return {
|
||||||
|
// Docling supported file types (currently only PDF)
|
||||||
|
'application/pdf': ['.pdf'],
|
||||||
|
// Audio files (always supported)
|
||||||
|
...audioFileTypes,
|
||||||
|
};
|
||||||
|
} else {
|
||||||
|
return {
|
||||||
|
// Unstructured supported file types
|
||||||
|
'image/bmp': ['.bmp'],
|
||||||
|
'text/csv': ['.csv'],
|
||||||
|
'application/msword': ['.doc'],
|
||||||
|
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],
|
||||||
|
'message/rfc822': ['.eml'],
|
||||||
|
'application/epub+zip': ['.epub'],
|
||||||
|
'image/heic': ['.heic'],
|
||||||
|
'text/html': ['.html'],
|
||||||
|
'image/jpeg': ['.jpeg', '.jpg'],
|
||||||
|
'image/png': ['.png'],
|
||||||
|
'application/vnd.ms-outlook': ['.msg'],
|
||||||
|
'application/vnd.oasis.opendocument.text': ['.odt'],
|
||||||
|
'text/x-org': ['.org'],
|
||||||
|
'application/pkcs7-signature': ['.p7s'],
|
||||||
|
'application/pdf': ['.pdf'],
|
||||||
|
'application/vnd.ms-powerpoint': ['.ppt'],
|
||||||
|
'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'],
|
||||||
|
'text/x-rst': ['.rst'],
|
||||||
|
'application/rtf': ['.rtf'],
|
||||||
|
'image/tiff': ['.tiff'],
|
||||||
|
'text/tab-separated-values': ['.tsv'],
|
||||||
|
'application/vnd.ms-excel': ['.xls'],
|
||||||
|
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
|
||||||
|
'application/xml': ['.xml'],
|
||||||
|
// Audio files (always supported)
|
||||||
|
...audioFileTypes,
|
||||||
|
};
|
||||||
}
|
}
|
||||||
: {
|
};
|
||||||
// Unstructured supported file types
|
|
||||||
'image/bmp': ['.bmp'],
|
const acceptedFileTypes = getAcceptedFileTypes();
|
||||||
'text/csv': ['.csv'],
|
|
||||||
'application/msword': ['.doc'],
|
|
||||||
'application/vnd.openxmlformats-officedocument.wordprocessingml.document': ['.docx'],
|
|
||||||
'message/rfc822': ['.eml'],
|
|
||||||
'application/epub+zip': ['.epub'],
|
|
||||||
'image/heic': ['.heic'],
|
|
||||||
'text/html': ['.html'],
|
|
||||||
'image/jpeg': ['.jpeg', '.jpg'],
|
|
||||||
'image/png': ['.png'],
|
|
||||||
'application/vnd.ms-outlook': ['.msg'],
|
|
||||||
'application/vnd.oasis.opendocument.text': ['.odt'],
|
|
||||||
'text/x-org': ['.org'],
|
|
||||||
'application/pkcs7-signature': ['.p7s'],
|
|
||||||
'application/pdf': ['.pdf'],
|
|
||||||
'application/vnd.ms-powerpoint': ['.ppt'],
|
|
||||||
'application/vnd.openxmlformats-officedocument.presentationml.presentation': ['.pptx'],
|
|
||||||
'text/x-rst': ['.rst'],
|
|
||||||
'application/rtf': ['.rtf'],
|
|
||||||
'image/tiff': ['.tiff'],
|
|
||||||
'text/tab-separated-values': ['.tsv'],
|
|
||||||
'application/vnd.ms-excel': ['.xls'],
|
|
||||||
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': ['.xlsx'],
|
|
||||||
'application/xml': ['.xml'],
|
|
||||||
// Audio files (always supported)
|
|
||||||
...audioFileTypes,
|
|
||||||
};
|
|
||||||
|
|
||||||
const supportedExtensions = Array.from(new Set(Object.values(acceptedFileTypes).flat())).sort()
|
const supportedExtensions = Array.from(new Set(Object.values(acceptedFileTypes).flat())).sort()
|
||||||
|
|
||||||
|
|
Loading…
Add table
Reference in a new issue