diff --git a/Dockerfile b/Dockerfile index c670ef8..68c0c55 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,22 +8,28 @@ RUN pip install flask elasticsearch ebooklib beautifulsoup4 PyPDF2 pytz # Create books directory with proper permissions RUN mkdir -p /books && chmod 777 /books +# Create project directory structure +RUN mkdir -p src/api/static src/api/templates src/core tests/unit + # Copy the API code and static files -COPY src/api/app.py . -COPY src/api/static /app/static -COPY src/api/templates /app/templates +COPY src/api/app.py src/api/ +COPY src/api/static src/api/static +COPY src/api/templates src/api/templates # Expose the API port EXPOSE 5000 # Copy the indexing script -COPY src/core/index.py . +COPY src/core/index.py src/core/ -# Copy the test file -COPY tests/unit/test_app.py . +# Copy test files +COPY tests/unit/ tests/unit/ # Add a dummy file to invalidate cache ADD dummy.txt . +# Set Python path +ENV PYTHONPATH=/app/src + # Command to run the API -CMD ["python", "app.py"] \ No newline at end of file +CMD ["python", "src/api/app.py"] \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index 1678f3c..e52e063 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -36,4 +36,12 @@ services: test: ["CMD", "curl", "-f", "http://localhost:9200"] interval: 30s timeout: 10s - retries: 5 \ No newline at end of file + retries: 5 + + booksearch_tests: + build: . + container_name: booksearch_tests + volumes: + - ./test_data:/app/test_data + - ./tests:/app/tests + command: sh -c "cd /app && PYTHONPATH=/app python -m unittest tests.unit.test_epub_extraction -v" diff --git a/scripts/run_tests.bat b/scripts/run_tests.bat deleted file mode 100644 index cacacf2..0000000 --- a/scripts/run_tests.bat +++ /dev/null @@ -1,37 +0,0 @@ -@echo off -echo Setting up test environment... - -echo Checking Python version... -python --version -if errorlevel 1 ( - echo Python not found. Please install Python 3.10+ first. 
- pause - exit /b 1 -) - -echo Installing Python dependencies... -python -m pip install --upgrade pip --user -if errorlevel 1 ( - echo Failed to upgrade pip - pause - exit /b 1 -) - -pip install -r requirements.txt --user -if errorlevel 1 ( - echo Failed to install dependencies - pause - exit /b 1 -) - -echo Running EPUB viewer tests... -cd api -python -m pytest test_epub_viewer.py -v -if errorlevel 1 ( - echo Some tests failed - pause - exit /b 1 -) - -echo All tests completed successfully! -pause \ No newline at end of file diff --git a/scripts/run_tests.sh b/scripts/run_tests.sh index 0aed9f0..2a13f74 100644 --- a/scripts/run_tests.sh +++ b/scripts/run_tests.sh @@ -1,29 +1,20 @@ -#!/bin/bash - -echo "Setting up test environment..." - -echo "Checking Python version..." -python3 --version || { - echo "Python 3 not found. Please install Python 3.10+ first." - exit 1 -} - -echo "Installing Python dependencies..." -python3 -m pip install --upgrade pip --user || { - echo "Failed to upgrade pip" - exit 1 -} - -pip3 install -r requirements.txt --user || { - echo "Failed to install dependencies" - exit 1 -} - -echo "Running EPUB viewer tests..." -cd api -python3 -m pytest test_epub_viewer.py -v || { - echo "Some tests failed" - exit 1 -} - -echo "All tests completed successfully!" \ No newline at end of file +#!/bin/bash + +# Abort on errors and undefined variables +set -euo pipefail + +# Get absolute path to project root (quoted: safe for paths with spaces) +PROJECT_ROOT=$(dirname "$(dirname "$(realpath "$0")")") + +# Run the test container +docker-compose -f "$PROJECT_ROOT/docker-compose.yml" up -d booksearch_tests + +# Follow the logs +docker logs -f booksearch_tests + +# Capture the container's exit status so CI sees test failures +STATUS=$(docker wait booksearch_tests) + +# Clean up +docker-compose -f "$PROJECT_ROOT/docker-compose.yml" down +exit "$STATUS" diff --git a/src/core/index.py b/src/core/index.py index 3b41a21..e82b96c 100644 --- a/src/core/index.py +++ b/src/core/index.py @@ -28,7 +28,8 @@ def create_index(): if not es.indices.exists(index=INDEX_NAME): es.indices.create(index=INDEX_NAME) -def extract_text_from_epub(epub_path): +# TODO: remove old version? 
+def extract_text_from_epub_old(epub_path): book = epub.read_epub(epub_path) text = '' for item in book.get_items(): @@ -37,6 +38,214 @@ def extract_text_from_epub(epub_path): text += soup.get_text() return text +# TODO: remove old version? +def extract_text_from_epub_interim(epub_path): + text = '' + try: + try: + book = epub.read_epub(epub_path) + except Exception as e: + with progress_lock: + indexing_progress['errors'].append(f"EPUB structure error in {epub_path}: {str(e)}") + return text # Return empty if we can't even read the EPUB + + for item in book.get_items(): + current_item_id = getattr(item, 'id', 'no_id') + try: + # Attempt to process all text-containing formats + if item.media_type in ['application/xhtml+xml', 'text/html', 'application/html']: + try: + content = item.get_content() + except Exception as e: + with progress_lock: + indexing_progress['errors'].append( + f"Content extraction failed in {epub_path} item {current_item_id}: {str(e)}") + continue + + try: + soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8') + item_text = soup.get_text(separator='\n', strip=True) + text += f"\n{item_text}\n" + except Exception as e: + with progress_lock: + indexing_progress['errors'].append( + f"HTML parsing failed in {epub_path} item {current_item_id}: {str(e)}") + # Fallback to raw content extraction + text += f"\n{content.decode('utf-8', errors='replace')}\n" + + except Exception as e: + with progress_lock: + indexing_progress['errors'].append( + f"Unexpected error processing {epub_path} item {current_item_id}: {str(e)}") + continue + + except Exception as e: + with progress_lock: + indexing_progress['errors'].append(f"Critical failure processing {epub_path}: {str(e)}") + + return text + +# TODO: remove old version? 
+def extract_text_from_epub_interim2(epub_path, progress_lock=None, indexing_progress=None): + """Extract text from EPUB using generator stabilization.""" + text = '' + errors = [] + info_messages = [] + + def add_error(msg): + errors.append(msg) + if indexing_progress and progress_lock: + with progress_lock: + indexing_progress['errors'].append(msg) + + # Validate file existence + if not os.path.exists(epub_path): + add_error(f"File not found: {epub_path}") + return '', errors + + try: + # --- EPUB Initialization --- + try: + book = epub.read_epub(epub_path) + info_messages.append(f"[MAIN] Processing EPUB: {os.path.basename(epub_path)}") + except Exception as e: + add_error(f"EPUB read failure: {str(e)}") + return '', errors + + # --- Metadata Extraction --- + md = lambda ns,name: book.get_metadata(ns, name)[0][0] if book.get_metadata(ns, name) else 'N/A' + info_messages.extend([ + "[METADATA]", + f"Title: {md('DC', 'title')}", + f"Creator: {md('DC', 'creator')}", + f"Language: {md('DC', 'language')}", + f"Identifier: {md('DC', 'identifier')}" + ]) + + # --- Critical Section: Resolve Generator Early --- + try: + raw_items = book.get_items() + item_cache = list(raw_items) # Convert generator to list IMMEDIATELY + item_map = {item.id: item for item in item_cache} + info_messages.append(f"[STRUCTURE] Found {len(item_cache)} items in manifest") + except Exception as e: + add_error(f"Item collection failed: {str(e)}") + return '', errors + + # --- Spine Reconciliation --- + spine_items = [] + try: + spine_ids = [s[0] for s in book.spine] + spine_items = [item_map.get(sid) for sid in spine_ids] + missing = len([sid for sid in spine_ids if sid not in item_map]) + + info_messages.append( + f"[SPINE] Contains {len(spine_ids)} entries " + f"({len(spine_items)-missing} valid, {missing} missing)" + ) + except Exception as e: + add_error(f"Spine analysis failed: {str(e)}") + + # --- Content Processing --- + content_blocks = [] + processed_items = 0 + + for item in item_cache: 
# Use stabilized list + if item.media_type not in {'application/xhtml+xml', 'text/html'}: + continue + + try: + # Filter items safely + if item.size == 0: + info_messages.append(f"[SKIP] Empty item: {item.id}") + continue + + try_context = f"Item {item.id} ({item.media_type})" + + # Content decoding + try: + content = item.get_content().decode('utf-8-sig') # Handle BOM + except UnicodeError: + content = item.get_content().decode('latin-1', errors='replace') + + # Text extraction + soup = BeautifulSoup(content, 'html.parser') + text = soup.get_text(separator='\n', strip=True) + + content_blocks.append(text) + processed_items += 1 + info_messages.append(f"[PROCESSED] {try_context} ({len(text)} chars)") + + except Exception as e: + add_error(f"Processing failed for {item.id}: {str(e)}") + + # --- Final Assembly --- + info_messages.append( + f"[STATS] Processed {processed_items}/{len(item_cache)} items " + f"({len(content_blocks)} valid blocks)" + ) + + full_text = '\n'.join(info_messages) + '\n\n' + '\n'.join(content_blocks) + return full_text.strip(), errors + + except Exception as e: + add_error(f"Critical failure: {str(e)}") + return '', errors + +def extract_text_from_epub(epub_path): + text = '' + try: + try: + book = epub.read_epub(epub_path) + except Exception as e: + with progress_lock: + indexing_progress['errors'].append(f"EPUB structure error in {epub_path}: {str(e)}") + return text + + # Collect all items first to handle generator issues + collected_items = [] + try: + for item in book.get_items(): + collected_items.append(item) + except Exception as e: + with progress_lock: + indexing_progress['errors'].append(f"Item collection failed in {epub_path}: {str(e)}") + + for item in collected_items: + current_item_id = getattr(item, 'id', 'no_id') + try: + if item.media_type in ['application/xhtml+xml', 'text/html', 'application/html']: + try: + content = item.get_content() + except Exception as e: + with progress_lock: + indexing_progress['errors'].append( + 
f"Content extraction failed in {epub_path} item {current_item_id}: {str(e)}") + continue + + try: + soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8') + item_text = soup.get_text(separator='\n', strip=True) + text += f"\n{item_text}\n" + except Exception as e: + with progress_lock: + indexing_progress['errors'].append( + f"HTML parsing failed in {epub_path} item {current_item_id}: {str(e)}") + text += f"\n{content.decode('utf-8', errors='replace')}\n" + + except Exception as e: + with progress_lock: + indexing_progress['errors'].append( + f"Unexpected error processing {epub_path} item {current_item_id}: {str(e)}") + continue + + except Exception as e: + with progress_lock: + indexing_progress['errors'].append(f"Critical failure processing {epub_path}: {str(e)}") + + return text + + def extract_text_from_pdf(pdf_path): text = '' with open(pdf_path, 'rb') as pdf_file: diff --git a/test.sh b/test.sh new file mode 100644 index 0000000..1175376 --- /dev/null +++ b/test.sh @@ -0,0 +1,2 @@ +#!/bin/bash +echo test \ No newline at end of file diff --git a/tests/unit/test_epub_extraction.py b/tests/unit/test_epub_extraction.py new file mode 100644 index 0000000..6406639 --- /dev/null +++ b/tests/unit/test_epub_extraction.py @@ -0,0 +1,37 @@ +import unittest +import os +from src.core.index import extract_text_from_epub + +class TestEPUBExtraction(unittest.TestCase): + def setUp(self): + self.test_data_dir = os.path.join(os.path.dirname(__file__), '../../test_data') + self.epub_files = [ + os.path.join(self.test_data_dir, f) + for f in os.listdir(self.test_data_dir) + if f.endswith('.epub') + ] + self.invalid_file = os.path.join(self.test_data_dir, 'nonexistent.epub') + + def test_extract_text_from_all_epubs(self): + """Test text extraction from all EPUB files in test_data""" + for epub_path in self.epub_files: + with self.subTest(epub_file=os.path.basename(epub_path)): + text = extract_text_from_epub(epub_path) + self.assertIsInstance(text, str) + 
self.assertGreater(len(text), 0, + f"Extracted text should not be empty for {epub_path}") + + def test_extract_text_from_invalid_file(self): + # Test error handling for non-existent file + text = extract_text_from_epub(self.invalid_file) + self.assertEqual(text, '') + + def test_empty_file_handling(self): + # Create a temporary empty EPUB file + import tempfile + with tempfile.NamedTemporaryFile(suffix='.epub') as temp_epub: + text = extract_text_from_epub(temp_epub.name) + self.assertEqual(text, '') + +if __name__ == '__main__': + unittest.main() \ No newline at end of file