fixed tests

Dmitriy Kazimirov 2025-04-02 03:57:52 +00:00
parent 5ef2ca9ad0
commit 80d365c859
7 changed files with 284 additions and 75 deletions

Dockerfile

@@ -8,22 +8,28 @@ RUN pip install flask elasticsearch ebooklib beautifulsoup4 PyPDF2 pytz
# Create books directory with proper permissions
RUN mkdir -p /books && chmod 777 /books
# Create project directory structure
RUN mkdir -p src/api/static src/api/templates src/core tests/unit
# Copy the API code and static files
COPY src/api/app.py .
COPY src/api/static /app/static
COPY src/api/templates /app/templates
COPY src/api/app.py src/api/
COPY src/api/static src/api/static
COPY src/api/templates src/api/templates
# Expose the API port
EXPOSE 5000
# Copy the indexing script
COPY src/core/index.py .
COPY src/core/index.py src/core/
# Copy the test file
COPY tests/unit/test_app.py .
# Copy test files
COPY tests/unit/ tests/unit/
# Add a dummy file to invalidate cache
ADD dummy.txt .
# Set Python path
ENV PYTHONPATH=/app/src
# Command to run the API
CMD ["python", "app.py"]
CMD ["python", "src/api/app.py"]

docker-compose.yml

@@ -36,4 +36,12 @@ services:
      test: ["CMD", "curl", "-f", "http://localhost:9200"]
      interval: 30s
      timeout: 10s
      retries: 5
      retries: 5

  booksearch_tests:
    build: .
    container_name: booksearch_tests
    volumes:
      - ./test_data:/app/test_data
      - ./tests:/app/tests
    command: sh -c "cd /app && PYTHONPATH=/app python -m unittest tests.unit.test_epub_extraction -v"


@@ -1,37 +0,0 @@
@echo off
echo Setting up test environment...
echo Checking Python version...
python --version
if errorlevel 1 (
    echo Python not found. Please install Python 3.10+ first.
    pause
    exit /b 1
)
echo Installing Python dependencies...
python -m pip install --upgrade pip --user
if errorlevel 1 (
    echo Failed to upgrade pip
    pause
    exit /b 1
)
pip install -r requirements.txt --user
if errorlevel 1 (
    echo Failed to install dependencies
    pause
    exit /b 1
)
echo Running EPUB viewer tests...
cd api
python -m pytest test_epub_viewer.py -v
if errorlevel 1 (
    echo Some tests failed
    pause
    exit /b 1
)
echo All tests completed successfully!
pause


@@ -1,29 +1,13 @@
#!/bin/bash
echo "Setting up test environment..."
echo "Checking Python version..."
python3 --version || {
    echo "Python 3 not found. Please install Python 3.10+ first."
    exit 1
}
echo "Installing Python dependencies..."
python3 -m pip install --upgrade pip --user || {
    echo "Failed to upgrade pip"
    exit 1
}
pip3 install -r requirements.txt --user || {
    echo "Failed to install dependencies"
    exit 1
}
echo "Running EPUB viewer tests..."
cd api
python3 -m pytest test_epub_viewer.py -v || {
    echo "Some tests failed"
    exit 1
}
echo "All tests completed successfully!"
#!/bin/bash
# Get absolute path to project root
PROJECT_ROOT=$(dirname $(dirname $(realpath $0)))
# Run the test container
docker-compose -f $PROJECT_ROOT/docker-compose.yml up -d booksearch_tests
# Follow the logs
docker logs -f booksearch_tests
# Clean up
docker-compose -f $PROJECT_ROOT/docker-compose.yml down
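If a cross-platform runner is ever needed, the same up / logs / down cycle can be scripted in Python. A minimal sketch (hypothetical alternative, not part of this commit), assuming docker-compose is on PATH and the script lives one directory below the project root, as above:

# Hypothetical Python equivalent of the shell runner above
import subprocess
from pathlib import Path

project_root = Path(__file__).resolve().parent.parent  # mirrors PROJECT_ROOT in the script
compose = ["docker-compose", "-f", str(project_root / "docker-compose.yml")]

subprocess.run(compose + ["up", "-d", "booksearch_tests"], check=True)
subprocess.run(["docker", "logs", "-f", "booksearch_tests"])  # blocks until the container exits
subprocess.run(compose + ["down"], check=True)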

src/core/index.py

@@ -28,7 +28,8 @@ def create_index():
    if not es.indices.exists(index=INDEX_NAME):
        es.indices.create(index=INDEX_NAME)

def extract_text_from_epub(epub_path):
# TODO: remove old version?
def extract_text_from_epub_old(epub_path):
    book = epub.read_epub(epub_path)
    text = ''
    for item in book.get_items():
@@ -37,6 +38,214 @@ def extract_text_from_epub(epub_path):
            text += soup.get_text()
    return text

# TODO: remove old version?
def extract_text_from_epub_interim(epub_path):
    text = ''
    try:
        try:
            book = epub.read_epub(epub_path)
        except Exception as e:
            with progress_lock:
                indexing_progress['errors'].append(f"EPUB structure error in {epub_path}: {str(e)}")
            return text  # Return empty if we can't even read the EPUB
        for item in book.get_items():
            current_item_id = getattr(item, 'id', 'no_id')
            try:
                # Attempt to process all text-containing formats
                if item.media_type in ['application/xhtml+xml', 'text/html', 'application/html']:
                    try:
                        content = item.get_content()
                    except Exception as e:
                        with progress_lock:
                            indexing_progress['errors'].append(
                                f"Content extraction failed in {epub_path} item {current_item_id}: {str(e)}")
                        continue
                    try:
                        soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
                        item_text = soup.get_text(separator='\n', strip=True)
                        text += f"\n{item_text}\n"
                    except Exception as e:
                        with progress_lock:
                            indexing_progress['errors'].append(
                                f"HTML parsing failed in {epub_path} item {current_item_id}: {str(e)}")
                        # Fallback to raw content extraction
                        text += f"\n{content.decode('utf-8', errors='replace')}\n"
            except Exception as e:
                with progress_lock:
                    indexing_progress['errors'].append(
                        f"Unexpected error processing {epub_path} item {current_item_id}: {str(e)}")
                continue
    except Exception as e:
        with progress_lock:
            indexing_progress['errors'].append(f"Critical failure processing {epub_path}: {str(e)}")
    return text

# TODO: remove old version?
def extract_text_from_epub_interim2(epub_path, progress_lock=None, indexing_progress=None):
    """Extract text from EPUB using generator stabilization."""
    text = ''
    errors = []
    info_messages = []
    def add_error(msg):
        errors.append(msg)
        if indexing_progress and progress_lock:
            with progress_lock:
                indexing_progress['errors'].append(msg)
    # Validate file existence
    if not os.path.exists(epub_path):
        add_error(f"File not found: {epub_path}")
        return '', errors
    try:
        # --- EPUB Initialization ---
        try:
            book = epub.read_epub(epub_path)
            info_messages.append(f"[MAIN] Processing EPUB: {os.path.basename(epub_path)}")
        except Exception as e:
            add_error(f"EPUB read failure: {str(e)}")
            return '', errors
        # --- Metadata Extraction ---
        md = lambda ns,name: book.get_metadata(ns, name)[0][0] if book.get_metadata(ns, name) else 'N/A'
        info_messages.extend([
            "[METADATA]",
            f"Title: {md('DC', 'title')}",
            f"Creator: {md('DC', 'creator')}",
            f"Language: {md('DC', 'language')}",
            f"Identifier: {md('DC', 'identifier')}"
        ])
        # --- Critical Section: Resolve Generator Early ---
        try:
            raw_items = book.get_items()
            item_cache = list(raw_items)  # Convert generator to list IMMEDIATELY
            item_map = {item.id: item for item in item_cache}
            info_messages.append(f"[STRUCTURE] Found {len(item_cache)} items in manifest")
        except Exception as e:
            add_error(f"Item collection failed: {str(e)}")
            return '', errors
        # --- Spine Reconciliation ---
        spine_items = []
        try:
            spine_ids = [s[0] for s in book.spine]
            spine_items = [item_map.get(sid) for sid in spine_ids]
            missing = len([sid for sid in spine_ids if sid not in item_map])
            info_messages.append(
                f"[SPINE] Contains {len(spine_ids)} entries "
                f"({len(spine_items)-missing} valid, {missing} missing)"
            )
        except Exception as e:
            add_error(f"Spine analysis failed: {str(e)}")
        # --- Content Processing ---
        content_blocks = []
        processed_items = 0
        for item in item_cache:  # Use stabilized list
            if item.media_type not in {'application/xhtml+xml', 'text/html'}:
                continue
            try:
                # Filter items safely
                if item.size == 0:
                    info_messages.append(f"[SKIP] Empty item: {item.id}")
                    continue
                try_context = f"Item {item.id} ({item.media_type})"
                # Content decoding
                try:
                    content = item.get_content().decode('utf-8-sig')  # Handle BOM
                except UnicodeError:
                    content = item.get_content().decode('latin-1', errors='replace')
                # Text extraction
                soup = BeautifulSoup(content, 'html.parser')
                text = soup.get_text(separator='\n', strip=True)
                content_blocks.append(text)
                processed_items += 1
                info_messages.append(f"[PROCESSED] {try_context} ({len(text)} chars)")
            except Exception as e:
                add_error(f"Processing failed for {item.id}: {str(e)}")
        # --- Final Assembly ---
        info_messages.append(
            f"[STATS] Processed {processed_items}/{len(item_cache)} items "
            f"({len(content_blocks)} valid blocks)"
        )
        full_text = '\n'.join(info_messages) + '\n\n' + '\n'.join(content_blocks)
        return full_text.strip(), errors
    except Exception as e:
        add_error(f"Critical failure: {str(e)}")
        return '', errors

def extract_text_from_epub(epub_path):
    text = ''
    try:
        try:
            book = epub.read_epub(epub_path)
        except Exception as e:
            with progress_lock:
                indexing_progress['errors'].append(f"EPUB structure error in {epub_path}: {str(e)}")
            return text
        # Collect all items first to handle generator issues
        collected_items = []
        try:
            for item in book.get_items():
                collected_items.append(item)
        except Exception as e:
            with progress_lock:
                indexing_progress['errors'].append(f"Item collection failed in {epub_path}: {str(e)}")
        for item in collected_items:
            current_item_id = getattr(item, 'id', 'no_id')
            try:
                if item.media_type in ['application/xhtml+xml', 'text/html', 'application/html']:
                    try:
                        content = item.get_content()
                    except Exception as e:
                        with progress_lock:
                            indexing_progress['errors'].append(
                                f"Content extraction failed in {epub_path} item {current_item_id}: {str(e)}")
                        continue
                    try:
                        soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
                        item_text = soup.get_text(separator='\n', strip=True)
                        text += f"\n{item_text}\n"
                    except Exception as e:
                        with progress_lock:
                            indexing_progress['errors'].append(
                                f"HTML parsing failed in {epub_path} item {current_item_id}: {str(e)}")
                        text += f"\n{content.decode('utf-8', errors='replace')}\n"
            except Exception as e:
                with progress_lock:
                    indexing_progress['errors'].append(
                        f"Unexpected error processing {epub_path} item {current_item_id}: {str(e)}")
                continue
    except Exception as e:
        with progress_lock:
            indexing_progress['errors'].append(f"Critical failure processing {epub_path}: {str(e)}")
    return text

def extract_text_from_pdf(pdf_path):
    text = ''
    with open(pdf_path, 'rb') as pdf_file:
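The replacement extract_text_from_epub keeps the original contract: it always returns a string (possibly empty) and reports failures through the module-level indexing_progress dict guarded by progress_lock instead of raising. A minimal caller sketch with a hypothetical path:

# Hypothetical caller; /books/sample.epub is a placeholder path
from src.core.index import extract_text_from_epub

text = extract_text_from_epub("/books/sample.epub")
if text:
    print(f"Extracted {len(text)} characters")
else:
    print("No text extracted; see indexing_progress['errors'] in src.core.index")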

test.sh (new file)

@@ -0,0 +1,2 @@
#!/bin/bash
echo test

tests/unit/test_epub_extraction.py

@@ -0,0 +1,37 @@
import unittest
import os
from src.core.index import extract_text_from_epub

class TestEPUBExtraction(unittest.TestCase):
    def setUp(self):
        self.test_data_dir = os.path.join(os.path.dirname(__file__), '../../test_data')
        self.epub_files = [
            os.path.join(self.test_data_dir, f)
            for f in os.listdir(self.test_data_dir)
            if f.endswith('.epub')
        ]
        self.invalid_file = os.path.join(self.test_data_dir, 'nonexistent.epub')

    def test_extract_text_from_all_epubs(self):
        """Test text extraction from all EPUB files in test_data"""
        for epub_path in self.epub_files:
            with self.subTest(epub_file=os.path.basename(epub_path)):
                text = extract_text_from_epub(epub_path)
                self.assertIsInstance(text, str)
                self.assertGreater(len(text), 0,
                    f"Extracted text should not be empty for {epub_path}")

    def test_extract_text_from_invalid_file(self):
        # Test error handling for non-existent file
        text = extract_text_from_epub(self.invalid_file)
        self.assertEqual(text, '')

    def test_empty_file_handling(self):
        # Create a temporary empty EPUB file
        import tempfile
        with tempfile.NamedTemporaryFile(suffix='.epub') as temp_epub:
            text = extract_text_from_epub(temp_epub.name)
            self.assertEqual(text, '')

if __name__ == '__main__':
    unittest.main()
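The suite depends on whatever EPUBs are present in test_data. A self-contained alternative is to generate a tiny fixture with ebooklib at test time; a minimal sketch (hypothetical addition, not part of this commit), assuming src.core.index imports cleanly outside the container:

# Hypothetical fixture generator, so tests need not depend on test_data contents
import os
import tempfile

from ebooklib import epub
from src.core.index import extract_text_from_epub

def make_fixture_epub(path):
    """Write a one-chapter EPUB to path."""
    book = epub.EpubBook()
    book.set_identifier("fixture-001")  # placeholder metadata
    book.set_title("Fixture Book")
    book.set_language("en")
    chapter = epub.EpubHtml(title="Chapter 1", file_name="ch1.xhtml", lang="en")
    chapter.content = "<h1>Chapter 1</h1><p>Hello from a generated EPUB.</p>"
    book.add_item(chapter)
    book.add_item(epub.EpubNcx())
    book.add_item(epub.EpubNav())
    book.spine = ["nav", chapter]
    epub.write_epub(path, book)

if __name__ == "__main__":
    with tempfile.TemporaryDirectory() as tmp:
        path = os.path.join(tmp, "fixture.epub")
        make_fixture_epub(path)
        print("Hello from a generated EPUB." in extract_text_from_epub(path))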