fixed tests
parent 5ef2ca9ad0
commit 80d365c859
7 changed files with 284 additions and 75 deletions
Dockerfile (20 changes)

@@ -8,22 +8,28 @@ RUN pip install flask elasticsearch ebooklib beautifulsoup4 PyPDF2 pytz
 # Create books directory with proper permissions
 RUN mkdir -p /books && chmod 777 /books
 
+# Create project directory structure
+RUN mkdir -p src/api/static src/api/templates src/core tests/unit
+
 # Copy the API code and static files
-COPY src/api/app.py .
-COPY src/api/static /app/static
-COPY src/api/templates /app/templates
+COPY src/api/app.py src/api/
+COPY src/api/static src/api/static
+COPY src/api/templates src/api/templates
 
 # Expose the API port
 EXPOSE 5000
 
 # Copy the indexing script
-COPY src/core/index.py .
+COPY src/core/index.py src/core/
 
-# Copy the test file
-COPY tests/unit/test_app.py .
+# Copy test files
+COPY tests/unit/ tests/unit/
 
 # Add a dummy file to invalidate cache
 ADD dummy.txt .
 
+# Set Python path
+ENV PYTHONPATH=/app/src
+
 # Command to run the API
-CMD ["python", "app.py"]
+CMD ["python", "src/api/app.py"]
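
Note: the image now mirrors the repository layout (assuming WORKDIR /app from the unshown top of the Dockerfile), with PYTHONPATH=/app/src for the API process, while the test service added to docker-compose.yml below overrides it with PYTHONPATH=/app. A minimal sanity-check sketch, not part of this commit, that one could run inside the container to confirm both import roots resolve:

# check_imports.py -- hypothetical sanity check, not part of this commit
import importlib
import sys

# PYTHONPATH=/app/src (Dockerfile) exposes the module as core.index;
# PYTHONPATH=/app (compose test service) exposes it as src.core.index.
for name in ("core.index", "src.core.index"):
    try:
        importlib.import_module(name)
        print(f"OK:   import {name}")
    except ImportError as exc:
        print(f"FAIL: import {name}: {exc}", file=sys.stderr)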

docker-compose.yml

@@ -36,4 +36,12 @@ services:
       test: ["CMD", "curl", "-f", "http://localhost:9200"]
       interval: 30s
       timeout: 10s
       retries: 5
+
+  booksearch_tests:
+    build: .
+    container_name: booksearch_tests
+    volumes:
+      - ./test_data:/app/test_data
+      - ./tests:/app/tests
+    command: sh -c "cd /app && PYTHONPATH=/app python -m unittest tests.unit.test_epub_extraction -v"
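
Note: the service command drives the new test module through unittest's CLI. A rough local equivalent using unittest's programmatic API, assuming it is run from the repository root (which stands in for PYTHONPATH=/app inside the container):

# run_unit_tests.py -- hypothetical local runner, not part of this commit
import sys
import unittest

sys.path.insert(0, ".")  # repo root on sys.path, like PYTHONPATH=/app
suite = unittest.defaultTestLoader.loadTestsFromName(
    "tests.unit.test_epub_extraction")
result = unittest.TextTestRunner(verbosity=2).run(suite)  # mirrors -v
sys.exit(0 if result.wasSuccessful() else 1)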

@@ -1,37 +0,0 @@
-@echo off
-echo Setting up test environment...
-
-echo Checking Python version...
-python --version
-if errorlevel 1 (
-    echo Python not found. Please install Python 3.10+ first.
-    pause
-    exit /b 1
-)
-
-echo Installing Python dependencies...
-python -m pip install --upgrade pip --user
-if errorlevel 1 (
-    echo Failed to upgrade pip
-    pause
-    exit /b 1
-)
-
-pip install -r requirements.txt --user
-if errorlevel 1 (
-    echo Failed to install dependencies
-    pause
-    exit /b 1
-)
-
-echo Running EPUB viewer tests...
-cd api
-python -m pytest test_epub_viewer.py -v
-if errorlevel 1 (
-    echo Some tests failed
-    pause
-    exit /b 1
-)
-
-echo All tests completed successfully!
-pause

@@ -1,29 +1,13 @@
 #!/bin/bash
 
-echo "Setting up test environment..."
-
-echo "Checking Python version..."
-python3 --version || {
-    echo "Python 3 not found. Please install Python 3.10+ first."
-    exit 1
-}
-
-echo "Installing Python dependencies..."
-python3 -m pip install --upgrade pip --user || {
-    echo "Failed to upgrade pip"
-    exit 1
-}
-
-pip3 install -r requirements.txt --user || {
-    echo "Failed to install dependencies"
-    exit 1
-}
-
-echo "Running EPUB viewer tests..."
-cd api
-python3 -m pytest test_epub_viewer.py -v || {
-    echo "Some tests failed"
-    exit 1
-}
-
-echo "All tests completed successfully!"
+# Get absolute path to project root
+PROJECT_ROOT=$(dirname $(dirname $(realpath $0)))
+
+# Run the test container
+docker-compose -f $PROJECT_ROOT/docker-compose.yml up -d booksearch_tests
+
+# Follow the logs
+docker logs -f booksearch_tests
+
+# Clean up
+docker-compose -f $PROJECT_ROOT/docker-compose.yml down
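
Note: `docker-compose up -d` followed by `docker logs -f` exits 0 whether or not the tests passed, so this script as written cannot gate CI on the test result. A hedged sketch of one way to surface the container's real exit status (`docker wait` prints it once the container stops); the helper and its name are hypothetical, not part of this commit:

# run_tests_with_status.py -- hypothetical variant, not part of this commit
import subprocess
import sys

def run_test_container() -> int:
    subprocess.run(["docker-compose", "up", "-d", "booksearch_tests"], check=True)
    subprocess.run(["docker", "logs", "-f", "booksearch_tests"])
    # `docker wait` blocks until the container stops and prints its exit code.
    out = subprocess.run(["docker", "wait", "booksearch_tests"],
                         capture_output=True, text=True, check=True)
    code = int(out.stdout.strip())
    subprocess.run(["docker-compose", "down"], check=True)
    return code

if __name__ == "__main__":
    sys.exit(run_test_container())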

src/core/index.py

@@ -28,7 +28,8 @@ def create_index():
     if not es.indices.exists(index=INDEX_NAME):
         es.indices.create(index=INDEX_NAME)
 
-def extract_text_from_epub(epub_path):
+# TODO: remove old version?
+def extract_text_from_epub_old(epub_path):
     book = epub.read_epub(epub_path)
     text = ''
     for item in book.get_items():
@@ -37,6 +38,214 @@ def extract_text_from_epub(epub_path):
             text += soup.get_text()
     return text
 
+
+# TODO: remove old version?
+def extract_text_from_epub_interim(epub_path):
+    text = ''
+    try:
+        try:
+            book = epub.read_epub(epub_path)
+        except Exception as e:
+            with progress_lock:
+                indexing_progress['errors'].append(f"EPUB structure error in {epub_path}: {str(e)}")
+            return text  # Return empty if we can't even read the EPUB
+
+        for item in book.get_items():
+            current_item_id = getattr(item, 'id', 'no_id')
+            try:
+                # Attempt to process all text-containing formats
+                if item.media_type in ['application/xhtml+xml', 'text/html', 'application/html']:
+                    try:
+                        content = item.get_content()
+                    except Exception as e:
+                        with progress_lock:
+                            indexing_progress['errors'].append(
+                                f"Content extraction failed in {epub_path} item {current_item_id}: {str(e)}")
+                        continue
+
+                    try:
+                        soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
+                        item_text = soup.get_text(separator='\n', strip=True)
+                        text += f"\n{item_text}\n"
+                    except Exception as e:
+                        with progress_lock:
+                            indexing_progress['errors'].append(
+                                f"HTML parsing failed in {epub_path} item {current_item_id}: {str(e)}")
+                        # Fallback to raw content extraction
+                        text += f"\n{content.decode('utf-8', errors='replace')}\n"
+
+            except Exception as e:
+                with progress_lock:
+                    indexing_progress['errors'].append(
+                        f"Unexpected error processing {epub_path} item {current_item_id}: {str(e)}")
+                continue
+
+    except Exception as e:
+        with progress_lock:
+            indexing_progress['errors'].append(f"Critical failure processing {epub_path}: {str(e)}")
+
+    return text
+
+
+# TODO: remove old version?
+def extract_text_from_epub_interim2(epub_path, progress_lock=None, indexing_progress=None):
+    """Extract text from EPUB using generator stabilization."""
+    text = ''
+    errors = []
+    info_messages = []
+
+    def add_error(msg):
+        errors.append(msg)
+        if indexing_progress and progress_lock:
+            with progress_lock:
+                indexing_progress['errors'].append(msg)
+
+    # Validate file existence
+    if not os.path.exists(epub_path):
+        add_error(f"File not found: {epub_path}")
+        return '', errors
+
+    try:
+        # --- EPUB Initialization ---
+        try:
+            book = epub.read_epub(epub_path)
+            info_messages.append(f"[MAIN] Processing EPUB: {os.path.basename(epub_path)}")
+        except Exception as e:
+            add_error(f"EPUB read failure: {str(e)}")
+            return '', errors
+
+        # --- Metadata Extraction ---
+        md = lambda ns, name: book.get_metadata(ns, name)[0][0] if book.get_metadata(ns, name) else 'N/A'
+        info_messages.extend([
+            "[METADATA]",
+            f"Title: {md('DC', 'title')}",
+            f"Creator: {md('DC', 'creator')}",
+            f"Language: {md('DC', 'language')}",
+            f"Identifier: {md('DC', 'identifier')}"
+        ])
+
+        # --- Critical Section: Resolve Generator Early ---
+        try:
+            raw_items = book.get_items()
+            item_cache = list(raw_items)  # Convert generator to list IMMEDIATELY
+            item_map = {item.id: item for item in item_cache}
+            info_messages.append(f"[STRUCTURE] Found {len(item_cache)} items in manifest")
+        except Exception as e:
+            add_error(f"Item collection failed: {str(e)}")
+            return '', errors
+
+        # --- Spine Reconciliation ---
+        spine_items = []
+        try:
+            spine_ids = [s[0] for s in book.spine]
+            spine_items = [item_map.get(sid) for sid in spine_ids]
+            missing = len([sid for sid in spine_ids if sid not in item_map])
+
+            info_messages.append(
+                f"[SPINE] Contains {len(spine_ids)} entries "
+                f"({len(spine_items)-missing} valid, {missing} missing)"
+            )
+        except Exception as e:
+            add_error(f"Spine analysis failed: {str(e)}")
+
+        # --- Content Processing ---
+        content_blocks = []
+        processed_items = 0
+
+        for item in item_cache:  # Use stabilized list
+            if item.media_type not in {'application/xhtml+xml', 'text/html'}:
+                continue
+
+            try:
+                # Filter items safely
+                if item.size == 0:
+                    info_messages.append(f"[SKIP] Empty item: {item.id}")
+                    continue
+
+                try_context = f"Item {item.id} ({item.media_type})"
+
+                # Content decoding
+                try:
+                    content = item.get_content().decode('utf-8-sig')  # Handle BOM
+                except UnicodeError:
+                    content = item.get_content().decode('latin-1', errors='replace')
+
+                # Text extraction
+                soup = BeautifulSoup(content, 'html.parser')
+                text = soup.get_text(separator='\n', strip=True)
+
+                content_blocks.append(text)
+                processed_items += 1
+                info_messages.append(f"[PROCESSED] {try_context} ({len(text)} chars)")
+
+            except Exception as e:
+                add_error(f"Processing failed for {item.id}: {str(e)}")
+
+        # --- Final Assembly ---
+        info_messages.append(
+            f"[STATS] Processed {processed_items}/{len(item_cache)} items "
+            f"({len(content_blocks)} valid blocks)"
+        )
+
+        full_text = '\n'.join(info_messages) + '\n\n' + '\n'.join(content_blocks)
+        return full_text.strip(), errors
+
+    except Exception as e:
+        add_error(f"Critical failure: {str(e)}")
+        return '', errors
+
+
+def extract_text_from_epub(epub_path):
+    text = ''
+    try:
+        try:
+            book = epub.read_epub(epub_path)
+        except Exception as e:
+            with progress_lock:
+                indexing_progress['errors'].append(f"EPUB structure error in {epub_path}: {str(e)}")
+            return text
+
+        # Collect all items first to handle generator issues
+        collected_items = []
+        try:
+            for item in book.get_items():
+                collected_items.append(item)
+        except Exception as e:
+            with progress_lock:
+                indexing_progress['errors'].append(f"Item collection failed in {epub_path}: {str(e)}")
+
+        for item in collected_items:
+            current_item_id = getattr(item, 'id', 'no_id')
+            try:
+                if item.media_type in ['application/xhtml+xml', 'text/html', 'application/html']:
+                    try:
+                        content = item.get_content()
+                    except Exception as e:
+                        with progress_lock:
+                            indexing_progress['errors'].append(
+                                f"Content extraction failed in {epub_path} item {current_item_id}: {str(e)}")
+                        continue
+
+                    try:
+                        soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
+                        item_text = soup.get_text(separator='\n', strip=True)
+                        text += f"\n{item_text}\n"
+                    except Exception as e:
+                        with progress_lock:
+                            indexing_progress['errors'].append(
+                                f"HTML parsing failed in {epub_path} item {current_item_id}: {str(e)}")
+                        text += f"\n{content.decode('utf-8', errors='replace')}\n"
+
+            except Exception as e:
+                with progress_lock:
+                    indexing_progress['errors'].append(
+                        f"Unexpected error processing {epub_path} item {current_item_id}: {str(e)}")
+                continue
+
+    except Exception as e:
+        with progress_lock:
+            indexing_progress['errors'].append(f"Critical failure processing {epub_path}: {str(e)}")
+
+    return text
+
+
 def extract_text_from_pdf(pdf_path):
     text = ''
     with open(pdf_path, 'rb') as pdf_file:
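
Note: the fix the final extract_text_from_epub settles on is to materialize book.get_items() into a list inside its own try/except before any per-item processing (the "generator stabilization" of the interim version). A minimal self-contained sketch, illustrative only and not part of this commit, of why that matters, using a stand-in generator:

# generator_stabilization_demo.py -- illustrative only, not part of this commit
def flaky_items():
    """Stand-in for book.get_items() raising partway through iteration."""
    yield "chapter1.xhtml"
    yield "chapter2.xhtml"
    raise RuntimeError("corrupt manifest entry")

# Collect first: a mid-iteration failure is caught exactly once, and every
# item yielded before the failure remains available for processing.
collected = []
try:
    for item in flaky_items():
        collected.append(item)
except RuntimeError as exc:
    print(f"Item collection failed: {exc}")

for item in collected:
    print(f"processing {item}")  # per-item errors can be handled individually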
test.sh (new file, 2 lines)

@@ -0,0 +1,2 @@
+#!/bin/bash
+echo test
tests/unit/test_epub_extraction.py (new file, 37 lines)

@@ -0,0 +1,37 @@
+import unittest
+import os
+from src.core.index import extract_text_from_epub
+
+class TestEPUBExtraction(unittest.TestCase):
+    def setUp(self):
+        self.test_data_dir = os.path.join(os.path.dirname(__file__), '../../test_data')
+        self.epub_files = [
+            os.path.join(self.test_data_dir, f)
+            for f in os.listdir(self.test_data_dir)
+            if f.endswith('.epub')
+        ]
+        self.invalid_file = os.path.join(self.test_data_dir, 'nonexistent.epub')
+
+    def test_extract_text_from_all_epubs(self):
+        """Test text extraction from all EPUB files in test_data"""
+        for epub_path in self.epub_files:
+            with self.subTest(epub_file=os.path.basename(epub_path)):
+                text = extract_text_from_epub(epub_path)
+                self.assertIsInstance(text, str)
+                self.assertGreater(len(text), 0,
+                    f"Extracted text should not be empty for {epub_path}")
+
+    def test_extract_text_from_invalid_file(self):
+        # Test error handling for non-existent file
+        text = extract_text_from_epub(self.invalid_file)
+        self.assertEqual(text, '')
+
+    def test_empty_file_handling(self):
+        # Create a temporary empty EPUB file
+        import tempfile
+        with tempfile.NamedTemporaryFile(suffix='.epub') as temp_epub:
+            text = extract_text_from_epub(temp_epub.name)
+            self.assertEqual(text, '')
+
+if __name__ == '__main__':
+    unittest.main()
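
Note: setUp calls os.listdir on test_data, so these tests error out (rather than skip) when that directory is absent; the compose volume ./test_data:/app/test_data provides it inside the container. A hypothetical hardening, not in the commit, for running outside the container:

# Hypothetical setUp guard (not in the commit): skip cleanly when the
# test_data directory is missing instead of raising FileNotFoundError.
def setUp(self):
    self.test_data_dir = os.path.join(os.path.dirname(__file__), '../../test_data')
    if not os.path.isdir(self.test_data_dir):
        self.skipTest(f"test_data directory not found: {self.test_data_dir}")
    self.epub_files = [
        os.path.join(self.test_data_dir, f)
        for f in os.listdir(self.test_data_dir)
        if f.endswith('.epub')
    ]
    self.invalid_file = os.path.join(self.test_data_dir, 'nonexistent.epub')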