search2_chatgpt/backend/indexer.py

import os
import requests
from pdfminer.high_level import extract_text
from ebooklib import epub
from bs4 import BeautifulSoup
from dotenv import load_dotenv

load_dotenv()
FILES_DIR = os.getenv("LOCAL_STORAGE_PATH", "/mnt/storage")
SEARCH_ENGINE = "http://meilisearch:7700"
INDEX_NAME = "documents"

def extract_text_from_pdf(pdf_path):
    return extract_text(pdf_path)

def extract_text_from_epub(epub_path):
    book = epub.read_epub(epub_path)
    text = []
    for item in book.get_items():
        if item.get_type() == 9:
            soup = BeautifulSoup(item.content, "html.parser")
            text.append(soup.get_text())
    return "\n".join(text)

def index_files():
    docs = []
    for filename in os.listdir(FILES_DIR):
        file_path = os.path.join(FILES_DIR, filename)

        if filename.endswith(".txt"):
            with open(file_path, "r", encoding="utf-8") as f:
                content = f.read()
        elif filename.endswith(".pdf"):
            content = extract_text_from_pdf(file_path)
        elif filename.endswith(".epub"):
            content = extract_text_from_epub(file_path)
        else:
            continue

        docs.append({"id": filename, "content": content})

    if docs:
        requests.post(f"{SEARCH_ENGINE}/indexes/{INDEX_NAME}/documents", json=docs)
        print(f"Èíäåêñèðîâàíî {len(docs)} ôàéëîâ!")

if __name__ == "__main__":
    index_files()