fixed tests
parent 5ef2ca9ad0
commit 80d365c859
7 changed files with 284 additions and 75 deletions
Dockerfile (20 changes)

@@ -8,22 +8,28 @@ RUN pip install flask elasticsearch ebooklib beautifulsoup4 PyPDF2 pytz
 # Create books directory with proper permissions
 RUN mkdir -p /books && chmod 777 /books
 
+# Create project directory structure
+RUN mkdir -p src/api/static src/api/templates src/core tests/unit
+
 # Copy the API code and static files
-COPY src/api/app.py .
-COPY src/api/static /app/static
-COPY src/api/templates /app/templates
+COPY src/api/app.py src/api/
+COPY src/api/static src/api/static
+COPY src/api/templates src/api/templates
 
 # Expose the API port
 EXPOSE 5000
 
 # Copy the indexing script
-COPY src/core/index.py .
+COPY src/core/index.py src/core/
 
-# Copy the test file
-COPY tests/unit/test_app.py .
+# Copy test files
+COPY tests/unit/ tests/unit/
 
 # Add a dummy file to invalidate cache
 ADD dummy.txt .
 
+# Set Python path
+ENV PYTHONPATH=/app/src
+
 # Command to run the API
-CMD ["python", "app.py"]
+CMD ["python", "src/api/app.py"]
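
Note: the image now mirrors the repository layout (assuming WORKDIR /app from the unshown top of the Dockerfile), with PYTHONPATH=/app/src for the API process, while the test service added to docker-compose.yml below overrides it with PYTHONPATH=/app. A minimal sanity-check sketch, not part of this commit, that one could run inside the container to confirm both import roots resolve:

# check_imports.py -- hypothetical sanity check, not part of this commit
import importlib
import sys

# PYTHONPATH=/app/src (Dockerfile) exposes the module as core.index;
# PYTHONPATH=/app (compose test service) exposes it as src.core.index.
for name in ("core.index", "src.core.index"):
    try:
        importlib.import_module(name)
        print(f"OK:   import {name}")
    except ImportError as exc:
        print(f"FAIL: import {name}: {exc}", file=sys.stderr)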

docker-compose.yml

@@ -36,4 +36,12 @@ services:
       test: ["CMD", "curl", "-f", "http://localhost:9200"]
       interval: 30s
       timeout: 10s
       retries: 5
+
+  booksearch_tests:
+    build: .
+    container_name: booksearch_tests
+    volumes:
+      - ./test_data:/app/test_data
+      - ./tests:/app/tests
+    command: sh -c "cd /app && PYTHONPATH=/app python -m unittest tests.unit.test_epub_extraction -v"
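
Note: the service command drives the new test module through unittest's CLI. A rough local equivalent using unittest's programmatic API, assuming it is run from the repository root (which stands in for PYTHONPATH=/app inside the container):

# run_unit_tests.py -- hypothetical local runner, not part of this commit
import sys
import unittest

sys.path.insert(0, ".")  # repo root on sys.path, like PYTHONPATH=/app
suite = unittest.defaultTestLoader.loadTestsFromName(
    "tests.unit.test_epub_extraction")
result = unittest.TextTestRunner(verbosity=2).run(suite)  # mirrors -v
sys.exit(0 if result.wasSuccessful() else 1)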

@@ -1,37 +0,0 @@
-@echo off
-echo Setting up test environment...
-
-echo Checking Python version...
-python --version
-if errorlevel 1 (
-    echo Python not found. Please install Python 3.10+ first.
-    pause
-    exit /b 1
-)
-
-echo Installing Python dependencies...
-python -m pip install --upgrade pip --user
-if errorlevel 1 (
-    echo Failed to upgrade pip
-    pause
-    exit /b 1
-)
-
-pip install -r requirements.txt --user
-if errorlevel 1 (
-    echo Failed to install dependencies
-    pause
-    exit /b 1
-)
-
-echo Running EPUB viewer tests...
-cd api
-python -m pytest test_epub_viewer.py -v
-if errorlevel 1 (
-    echo Some tests failed
-    pause
-    exit /b 1
-)
-
-echo All tests completed successfully!
-pause

@@ -1,29 +1,13 @@
 #!/bin/bash
 
-echo "Setting up test environment..."
-
-echo "Checking Python version..."
-python3 --version || {
-    echo "Python 3 not found. Please install Python 3.10+ first."
-    exit 1
-}
-
-echo "Installing Python dependencies..."
-python3 -m pip install --upgrade pip --user || {
-    echo "Failed to upgrade pip"
-    exit 1
-}
-
-pip3 install -r requirements.txt --user || {
-    echo "Failed to install dependencies"
-    exit 1
-}
-
-echo "Running EPUB viewer tests..."
-cd api
-python3 -m pytest test_epub_viewer.py -v || {
-    echo "Some tests failed"
-    exit 1
-}
-
-echo "All tests completed successfully!"
+# Get absolute path to project root
+PROJECT_ROOT=$(dirname $(dirname $(realpath $0)))
+
+# Run the test container
+docker-compose -f $PROJECT_ROOT/docker-compose.yml up -d booksearch_tests
+
+# Follow the logs
+docker logs -f booksearch_tests
+
+# Clean up
+docker-compose -f $PROJECT_ROOT/docker-compose.yml down
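
Note: `docker-compose up -d` followed by `docker logs -f` exits 0 whether or not the tests passed, so this script as written cannot gate CI on the test result. A hedged sketch of one way to surface the container's real exit status (`docker wait` prints it once the container stops); the helper and its name are hypothetical, not part of this commit:

# run_tests_with_status.py -- hypothetical variant, not part of this commit
import subprocess
import sys

def run_test_container() -> int:
    subprocess.run(["docker-compose", "up", "-d", "booksearch_tests"], check=True)
    subprocess.run(["docker", "logs", "-f", "booksearch_tests"])
    # `docker wait` blocks until the container stops and prints its exit code.
    out = subprocess.run(["docker", "wait", "booksearch_tests"],
                         capture_output=True, text=True, check=True)
    code = int(out.stdout.strip())
    subprocess.run(["docker-compose", "down"], check=True)
    return code

if __name__ == "__main__":
    sys.exit(run_test_container())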

src/core/index.py

@@ -28,7 +28,8 @@ def create_index():
     if not es.indices.exists(index=INDEX_NAME):
         es.indices.create(index=INDEX_NAME)
 
-def extract_text_from_epub(epub_path):
+# TODO: remove old version?
+def extract_text_from_epub_old(epub_path):
     book = epub.read_epub(epub_path)
     text = ''
     for item in book.get_items():
@@ -37,6 +38,214 @@ def extract_text_from_epub(epub_path):
             text += soup.get_text()
     return text
 
+
+# TODO: remove old version?
+def extract_text_from_epub_interim(epub_path):
+    text = ''
+    try:
+        try:
+            book = epub.read_epub(epub_path)
+        except Exception as e:
+            with progress_lock:
+                indexing_progress['errors'].append(f"EPUB structure error in {epub_path}: {str(e)}")
+            return text  # Return empty if we can't even read the EPUB
+
+        for item in book.get_items():
+            current_item_id = getattr(item, 'id', 'no_id')
+            try:
+                # Attempt to process all text-containing formats
+                if item.media_type in ['application/xhtml+xml', 'text/html', 'application/html']:
+                    try:
+                        content = item.get_content()
+                    except Exception as e:
+                        with progress_lock:
+                            indexing_progress['errors'].append(
+                                f"Content extraction failed in {epub_path} item {current_item_id}: {str(e)}")
+                        continue
+
+                    try:
+                        soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
+                        item_text = soup.get_text(separator='\n', strip=True)
+                        text += f"\n{item_text}\n"
+                    except Exception as e:
+                        with progress_lock:
+                            indexing_progress['errors'].append(
+                                f"HTML parsing failed in {epub_path} item {current_item_id}: {str(e)}")
+                        # Fallback to raw content extraction
+                        text += f"\n{content.decode('utf-8', errors='replace')}\n"
+
+            except Exception as e:
+                with progress_lock:
+                    indexing_progress['errors'].append(
+                        f"Unexpected error processing {epub_path} item {current_item_id}: {str(e)}")
+                continue
+
+    except Exception as e:
+        with progress_lock:
+            indexing_progress['errors'].append(f"Critical failure processing {epub_path}: {str(e)}")
+
+    return text
+
+
+# TODO: remove old version?
+def extract_text_from_epub_interim2(epub_path, progress_lock=None, indexing_progress=None):
+    """Extract text from EPUB using generator stabilization."""
+    text = ''
+    errors = []
+    info_messages = []
+
+    def add_error(msg):
+        errors.append(msg)
+        if indexing_progress and progress_lock:
+            with progress_lock:
+                indexing_progress['errors'].append(msg)
+
+    # Validate file existence
+    if not os.path.exists(epub_path):
+        add_error(f"File not found: {epub_path}")
+        return '', errors
+
+    try:
+        # --- EPUB Initialization ---
+        try:
+            book = epub.read_epub(epub_path)
+            info_messages.append(f"[MAIN] Processing EPUB: {os.path.basename(epub_path)}")
+        except Exception as e:
+            add_error(f"EPUB read failure: {str(e)}")
+            return '', errors
+
+        # --- Metadata Extraction ---
+        md = lambda ns, name: book.get_metadata(ns, name)[0][0] if book.get_metadata(ns, name) else 'N/A'
+        info_messages.extend([
+            "[METADATA]",
+            f"Title: {md('DC', 'title')}",
+            f"Creator: {md('DC', 'creator')}",
+            f"Language: {md('DC', 'language')}",
+            f"Identifier: {md('DC', 'identifier')}"
+        ])
+
+        # --- Critical Section: Resolve Generator Early ---
+        try:
+            raw_items = book.get_items()
+            item_cache = list(raw_items)  # Convert generator to list IMMEDIATELY
+            item_map = {item.id: item for item in item_cache}
+            info_messages.append(f"[STRUCTURE] Found {len(item_cache)} items in manifest")
+        except Exception as e:
+            add_error(f"Item collection failed: {str(e)}")
+            return '', errors
+
+        # --- Spine Reconciliation ---
+        spine_items = []
+        try:
+            spine_ids = [s[0] for s in book.spine]
+            spine_items = [item_map.get(sid) for sid in spine_ids]
+            missing = len([sid for sid in spine_ids if sid not in item_map])
+
+            info_messages.append(
+                f"[SPINE] Contains {len(spine_ids)} entries "
+                f"({len(spine_items)-missing} valid, {missing} missing)"
+            )
+        except Exception as e:
+            add_error(f"Spine analysis failed: {str(e)}")
+
+        # --- Content Processing ---
+        content_blocks = []
+        processed_items = 0
+
+        for item in item_cache:  # Use stabilized list
+            if item.media_type not in {'application/xhtml+xml', 'text/html'}:
+                continue
+
+            try:
+                # Filter items safely
+                if item.size == 0:
+                    info_messages.append(f"[SKIP] Empty item: {item.id}")
+                    continue
+
+                try_context = f"Item {item.id} ({item.media_type})"
+
+                # Content decoding
+                try:
+                    content = item.get_content().decode('utf-8-sig')  # Handle BOM
+                except UnicodeError:
+                    content = item.get_content().decode('latin-1', errors='replace')
+
+                # Text extraction
+                soup = BeautifulSoup(content, 'html.parser')
+                text = soup.get_text(separator='\n', strip=True)
+
+                content_blocks.append(text)
+                processed_items += 1
+                info_messages.append(f"[PROCESSED] {try_context} ({len(text)} chars)")
+
+            except Exception as e:
+                add_error(f"Processing failed for {item.id}: {str(e)}")
+
+        # --- Final Assembly ---
+        info_messages.append(
+            f"[STATS] Processed {processed_items}/{len(item_cache)} items "
+            f"({len(content_blocks)} valid blocks)"
+        )
+
+        full_text = '\n'.join(info_messages) + '\n\n' + '\n'.join(content_blocks)
+        return full_text.strip(), errors
+
+    except Exception as e:
+        add_error(f"Critical failure: {str(e)}")
+        return '', errors
+
+
+def extract_text_from_epub(epub_path):
+    text = ''
+    try:
+        try:
+            book = epub.read_epub(epub_path)
+        except Exception as e:
+            with progress_lock:
+                indexing_progress['errors'].append(f"EPUB structure error in {epub_path}: {str(e)}")
+            return text
+
+        # Collect all items first to handle generator issues
+        collected_items = []
+        try:
+            for item in book.get_items():
+                collected_items.append(item)
+        except Exception as e:
+            with progress_lock:
+                indexing_progress['errors'].append(f"Item collection failed in {epub_path}: {str(e)}")
+
+        for item in collected_items:
+            current_item_id = getattr(item, 'id', 'no_id')
+            try:
+                if item.media_type in ['application/xhtml+xml', 'text/html', 'application/html']:
+                    try:
+                        content = item.get_content()
+                    except Exception as e:
+                        with progress_lock:
+                            indexing_progress['errors'].append(
+                                f"Content extraction failed in {epub_path} item {current_item_id}: {str(e)}")
+                        continue
+
+                    try:
+                        soup = BeautifulSoup(content, 'html.parser', from_encoding='utf-8')
+                        item_text = soup.get_text(separator='\n', strip=True)
+                        text += f"\n{item_text}\n"
+                    except Exception as e:
+                        with progress_lock:
+                            indexing_progress['errors'].append(
+                                f"HTML parsing failed in {epub_path} item {current_item_id}: {str(e)}")
+                        text += f"\n{content.decode('utf-8', errors='replace')}\n"
+
+            except Exception as e:
+                with progress_lock:
+                    indexing_progress['errors'].append(
+                        f"Unexpected error processing {epub_path} item {current_item_id}: {str(e)}")
+                continue
+
+    except Exception as e:
+        with progress_lock:
+            indexing_progress['errors'].append(f"Critical failure processing {epub_path}: {str(e)}")
+
+    return text
+
+
 def extract_text_from_pdf(pdf_path):
     text = ''
     with open(pdf_path, 'rb') as pdf_file:
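
Note: the fix the final extract_text_from_epub settles on is to materialize book.get_items() into a list inside its own try/except before any per-item processing (the "generator stabilization" of the interim version). A minimal self-contained sketch, illustrative only and not part of this commit, of why that matters, using a stand-in generator:

# generator_stabilization_demo.py -- illustrative only, not part of this commit
def flaky_items():
    """Stand-in for book.get_items() raising partway through iteration."""
    yield "chapter1.xhtml"
    yield "chapter2.xhtml"
    raise RuntimeError("corrupt manifest entry")

# Collect first: a mid-iteration failure is caught exactly once, and every
# item yielded before the failure remains available for processing.
collected = []
try:
    for item in flaky_items():
        collected.append(item)
except RuntimeError as exc:
    print(f"Item collection failed: {exc}")

for item in collected:
    print(f"processing {item}")  # per-item errors can be handled individually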
test.sh (new file, 2 lines)

@@ -0,0 +1,2 @@
+#!/bin/bash
+echo test
tests/unit/test_epub_extraction.py (new file, 37 lines)

@@ -0,0 +1,37 @@
+import unittest
+import os
+from src.core.index import extract_text_from_epub
+
+class TestEPUBExtraction(unittest.TestCase):
+    def setUp(self):
+        self.test_data_dir = os.path.join(os.path.dirname(__file__), '../../test_data')
+        self.epub_files = [
+            os.path.join(self.test_data_dir, f)
+            for f in os.listdir(self.test_data_dir)
+            if f.endswith('.epub')
+        ]
+        self.invalid_file = os.path.join(self.test_data_dir, 'nonexistent.epub')
+
+    def test_extract_text_from_all_epubs(self):
+        """Test text extraction from all EPUB files in test_data"""
+        for epub_path in self.epub_files:
+            with self.subTest(epub_file=os.path.basename(epub_path)):
+                text = extract_text_from_epub(epub_path)
+                self.assertIsInstance(text, str)
+                self.assertGreater(len(text), 0,
+                    f"Extracted text should not be empty for {epub_path}")
+
+    def test_extract_text_from_invalid_file(self):
+        # Test error handling for non-existent file
+        text = extract_text_from_epub(self.invalid_file)
+        self.assertEqual(text, '')
+
+    def test_empty_file_handling(self):
+        # Create a temporary empty EPUB file
+        import tempfile
+        with tempfile.NamedTemporaryFile(suffix='.epub') as temp_epub:
+            text = extract_text_from_epub(temp_epub.name)
+            self.assertEqual(text, '')
+
+if __name__ == '__main__':
+    unittest.main()
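
Note: setUp calls os.listdir on test_data, so these tests error out (rather than skip) when that directory is absent; the compose volume ./test_data:/app/test_data provides it inside the container. A hypothetical hardening, not in the commit, for running outside the container:

# Hypothetical setUp guard (not in the commit): skip cleanly when the
# test_data directory is missing instead of raising FileNotFoundError.
def setUp(self):
    self.test_data_dir = os.path.join(os.path.dirname(__file__), '../../test_data')
    if not os.path.isdir(self.test_data_dir):
        self.skipTest(f"test_data directory not found: {self.test_data_dir}")
    self.epub_files = [
        os.path.join(self.test_data_dir, f)
        for f in os.listdir(self.test_data_dir)
        if f.endswith('.epub')
    ]
    self.invalid_file = os.path.join(self.test_data_dir, 'nonexistent.epub')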