search2_chatgpt/backend/indexer.py

47 lines
1.4 KiB
Python

import os
import requests
from pdfminer.high_level import extract_text
from ebooklib import epub
from bs4 import BeautifulSoup
from dotenv import load_dotenv
load_dotenv()
FILES_DIR = os.getenv("LOCAL_STORAGE_PATH", "/mnt/storage")
SEARCH_ENGINE = "http://meilisearch:7700"
INDEX_NAME = "documents"
def extract_text_from_pdf(pdf_path):
return extract_text(pdf_path)
def extract_text_from_epub(epub_path):
book = epub.read_epub(epub_path)
text = []
for item in book.get_items():
if item.get_type() == 9:
soup = BeautifulSoup(item.content, "html.parser")
text.append(soup.get_text())
return "\n".join(text)
def index_files():
docs = []
for filename in os.listdir(FILES_DIR):
file_path = os.path.join(FILES_DIR, filename)
if filename.endswith(".txt"):
with open(file_path, "r", encoding="utf-8") as f:
content = f.read()
elif filename.endswith(".pdf"):
content = extract_text_from_pdf(file_path)
elif filename.endswith(".epub"):
content = extract_text_from_epub(file_path)
else:
continue
docs.append({"id": filename, "content": content})
if docs:
requests.post(f"{SEARCH_ENGINE}/indexes/{INDEX_NAME}/documents", json=docs)
print(f"Èíäåêñèðîâàíî {len(docs)} ôàéëîâ!")
if __name__ == "__main__":
index_files()