Update Elasticsearch configuration and increase max_result_window; bump application version to 0.0.9

Dmitriy Kazimirov 2025-04-06 13:58:35 +00:00
parent da243e7f2f
commit 2928577477
3 changed files with 44 additions and 20 deletions


@@ -40,6 +40,9 @@ services:
       - ELASTICSEARCH_PLUGINS=analysis-stempel
       - ES_JAVA_OPTS=-Xms6g -Xmx6g
       - bootstrap.memory_lock=true
+      - "ELASTICSEARCH_HEAP_SIZE=6g"
+      - "ELASTICSEARCH_EXTRA_JAVA_OPTS=-Xms6g -Xmx6g"
+      - "ELASTICSEARCH_CLUSTER_SETTINGS=index.max_result_window=50000"
     restart: unless-stopped
     deploy:
       resources:
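
Note that the heap is set twice here: once through ES_JAVA_OPTS and again through ELASTICSEARCH_HEAP_SIZE / ELASTICSEARCH_EXTRA_JAVA_OPTS, which look like Bitnami-style image variables; whether ELASTICSEARCH_CLUSTER_SETTINGS is honored at all depends on the image, and index.max_result_window is an index-level setting in any case, so the put_settings call later in this commit is what reliably applies it. A minimal sketch, not part of the commit, to confirm the 6g heap actually took effect; it assumes elasticsearch-py and a node exposed on localhost:

# Sketch, not part of this commit: report the JVM heap limit per node.
# Assumes elasticsearch-py and a node reachable on localhost:9200.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")  # assumed endpoint

for node in es.nodes.stats(metric="jvm")["nodes"].values():
    heap_gib = node["jvm"]["mem"]["heap_max_in_bytes"] / 1024**3
    print(f"{node['name']}: heap_max = {heap_gib:.1f} GiB")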


@@ -55,7 +55,7 @@ from markdown_it import MarkdownIt
 from functools import lru_cache
 from threading import Lock
 
 # Application version
-APP_VERSION = "0.0.8 (2025 Apr 6th)"
+APP_VERSION = "0.0.9 (2025 Apr 6th)"
 # Configuration constants
 ITEMS_PER_PAGE = int(os.environ.get("ITEMS_PER_PAGE", 50))  # Default items per page


@@ -59,26 +59,33 @@ def are_files_indexed(file_paths):
         else:
             normalized_paths.append(os.path.join("/books", path))
 
-    # Build a batch query
-    query = {
-        "size": len(normalized_paths),  # Return all matching documents
-        "query": {
-            "terms": {
-                "file_path.keyword": normalized_paths
-            }
-        }
-    }
+    # Process in batches to avoid Elasticsearch result window limits
+    batch_size = 5000  # Safe batch size below the 10000 limit
+    indexed_files = {}
+
+    # Process normalized paths in batches
+    for i in range(0, len(normalized_paths), batch_size):
+        batch_paths = normalized_paths[i:i+batch_size]
+
+        # Build a batch query
+        query = {
+            "size": batch_size,
+            "query": {
+                "terms": {
+                    "file_path.keyword": batch_paths
+                }
+            }
+        }
 
-    # Execute the query
-    result = es.search(index=INDEX_NAME, body=query)
+        # Execute the query
+        result = es.search(index=INDEX_NAME, body=query)
 
-    # Process results
-    indexed_files = {}
-    for hit in result['hits']['hits']:
-        path = hit['_source']['file_path']
-        content = hit['_source']['content']
-        token_count = len(content.split())
-        indexed_files[path] = (True, token_count)
+        # Process batch results
+        for hit in result['hits']['hits']:
+            path = hit['_source']['file_path']
+            content = hit['_source']['content']
+            token_count = len(content.split())
+            indexed_files[path] = (True, token_count)
 
     # Build the complete result dictionary
     results = {}
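
The rewritten loop caps each terms query at 5,000 paths, keeping "size" safely under the default index.max_result_window of 10,000 hits per search. A condensed sketch of the same pattern, with es and INDEX_NAME assumed from the surrounding module:

# Sketch of the batching pattern above; `es` and INDEX_NAME are assumed
# from the surrounding module.
def indexed_token_counts(paths, batch_size=5000):
    found = {}
    for start in range(0, len(paths), batch_size):
        batch = paths[start:start + batch_size]
        resp = es.search(index=INDEX_NAME, body={
            "size": len(batch),  # exact cap per batch, below the window
            "query": {"terms": {"file_path.keyword": batch}},
        })
        for hit in resp["hits"]["hits"]:
            src = hit["_source"]
            found[src["file_path"]] = (True, len(src["content"].split()))
    return found

Requesting size=len(batch) rather than a fixed batch_size keeps the final, shorter batch exact; the terms query itself accepts up to 65,536 terms by default (index.max_terms_count), so 5,000 is safe on that axis as well.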
@@ -111,10 +118,24 @@ def create_index():
                     "index.refresh_interval": "5s",
                     "index.number_of_replicas": 0,
                     "index.translog.durability": "async",
-                    "index.mapping.total_fields.limit": 10000
+                    "index.mapping.total_fields.limit": 10000,
+                    "index.max_result_window": 50000  # Increased from default 10000
                 }
             }
         )
+    else:
+        # Update existing index settings to increase max_result_window
+        try:
+            es.indices.put_settings(
+                index=INDEX_NAME,
+                body={
+                    "index": {
+                        "max_result_window": 50000
+                    }
+                }
+            )
+        except Exception as e:
+            print(f"Warning: Could not update max_result_window setting: {e}")
 
 # TODO: remove old version?
 def extract_text_from_epub_old(epub_path):
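
Since index.max_result_window is a dynamic index setting, the put_settings call above takes effect on a live index without a close and reopen. A small sketch, not part of the commit, that reads the setting back and raises it only when needed; es and INDEX_NAME are assumed from context:

# Sketch, not part of this commit: verify the live setting instead of
# assuming it. `es` and INDEX_NAME are assumed from context.
def ensure_result_window(window=50000):
    settings = es.indices.get_settings(index=INDEX_NAME)
    current = settings[INDEX_NAME]["settings"]["index"].get("max_result_window")
    if current is None or int(current) < window:
        es.indices.put_settings(
            index=INDEX_NAME,
            body={"index": {"max_result_window": window}},
        )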