Update Elasticsearch configuration and increase max_result_window; bump application version to 0.0.9

Dmitriy Kazimirov 2025-04-06 13:58:35 +00:00
parent da243e7f2f
commit 2928577477
3 changed files with 44 additions and 20 deletions


@@ -40,6 +40,9 @@ services:
       - ELASTICSEARCH_PLUGINS=analysis-stempel
       - ES_JAVA_OPTS=-Xms6g -Xmx6g
       - bootstrap.memory_lock=true
+      - "ELASTICSEARCH_HEAP_SIZE=6g"
+      - "ELASTICSEARCH_EXTRA_JAVA_OPTS=-Xms6g -Xmx6g"
+      - "ELASTICSEARCH_CLUSTER_SETTINGS=index.max_result_window=50000"
     restart: unless-stopped
     deploy:
       resources:
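
Note that the heap is set twice here: once through ES_JAVA_OPTS and again through ELASTICSEARCH_HEAP_SIZE / ELASTICSEARCH_EXTRA_JAVA_OPTS, which look like Bitnami-style image variables; whether ELASTICSEARCH_CLUSTER_SETTINGS is honored at all depends on the image, and index.max_result_window is an index-level setting in any case, so the put_settings call later in this commit is what reliably applies it. A minimal sketch, not part of the commit, to confirm the 6g heap actually took effect; it assumes elasticsearch-py and a node exposed on localhost:

# Sketch, not part of this commit: report the JVM heap limit per node.
# Assumes elasticsearch-py and a node reachable on localhost:9200.
from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")  # assumed endpoint

for node in es.nodes.stats(metric="jvm")["nodes"].values():
    heap_gib = node["jvm"]["mem"]["heap_max_in_bytes"] / 1024**3
    print(f"{node['name']}: heap_max = {heap_gib:.1f} GiB")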


@@ -55,7 +55,7 @@ from markdown_it import MarkdownIt
 from functools import lru_cache
 from threading import Lock
 
 # Application version
-APP_VERSION = "0.0.8 (2025 Apr 6th)"
+APP_VERSION = "0.0.9 (2025 Apr 6th)"
 # Configuration constants
 ITEMS_PER_PAGE = int(os.environ.get("ITEMS_PER_PAGE", 50))  # Default items per page


@@ -59,26 +59,33 @@ def are_files_indexed(file_paths):
         else:
             normalized_paths.append(os.path.join("/books", path))
 
-    # Build a batch query
-    query = {
-        "size": len(normalized_paths),  # Return all matching documents
-        "query": {
-            "terms": {
-                "file_path.keyword": normalized_paths
-            }
-        }
-    }
+    # Process in batches to avoid Elasticsearch result window limits
+    batch_size = 5000  # Safe batch size below the 10000 limit
+    indexed_files = {}
+
+    # Process normalized paths in batches
+    for i in range(0, len(normalized_paths), batch_size):
+        batch_paths = normalized_paths[i:i+batch_size]
+
+        # Build a batch query
+        query = {
+            "size": batch_size,
+            "query": {
+                "terms": {
+                    "file_path.keyword": batch_paths
+                }
+            }
+        }
 
-    # Execute the query
-    result = es.search(index=INDEX_NAME, body=query)
+        # Execute the query
+        result = es.search(index=INDEX_NAME, body=query)
 
-    # Process results
-    indexed_files = {}
-    for hit in result['hits']['hits']:
-        path = hit['_source']['file_path']
-        content = hit['_source']['content']
-        token_count = len(content.split())
-        indexed_files[path] = (True, token_count)
+        # Process batch results
+        for hit in result['hits']['hits']:
+            path = hit['_source']['file_path']
+            content = hit['_source']['content']
+            token_count = len(content.split())
+            indexed_files[path] = (True, token_count)
 
     # Build the complete result dictionary
     results = {}
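
The rewritten loop caps each terms query at 5,000 paths, keeping "size" safely under the default index.max_result_window of 10,000 hits per search. A condensed sketch of the same pattern, with es and INDEX_NAME assumed from the surrounding module:

# Sketch of the batching pattern above; `es` and INDEX_NAME are assumed
# from the surrounding module.
def indexed_token_counts(paths, batch_size=5000):
    found = {}
    for start in range(0, len(paths), batch_size):
        batch = paths[start:start + batch_size]
        resp = es.search(index=INDEX_NAME, body={
            "size": len(batch),  # exact cap per batch, below the window
            "query": {"terms": {"file_path.keyword": batch}},
        })
        for hit in resp["hits"]["hits"]:
            src = hit["_source"]
            found[src["file_path"]] = (True, len(src["content"].split()))
    return found

Requesting size=len(batch) rather than a fixed batch_size keeps the final, shorter batch exact; the terms query itself accepts up to 65,536 terms by default (index.max_terms_count), so 5,000 is safe on that axis as well.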
@@ -111,10 +118,24 @@ def create_index():
                     "index.refresh_interval": "5s",
                     "index.number_of_replicas": 0,
                     "index.translog.durability": "async",
-                    "index.mapping.total_fields.limit": 10000
+                    "index.mapping.total_fields.limit": 10000,
+                    "index.max_result_window": 50000  # Increased from default 10000
                 }
             }
         )
+    else:
+        # Update existing index settings to increase max_result_window
+        try:
+            es.indices.put_settings(
+                index=INDEX_NAME,
+                body={
+                    "index": {
+                        "max_result_window": 50000
+                    }
+                }
+            )
+        except Exception as e:
+            print(f"Warning: Could not update max_result_window setting: {e}")
 
 # TODO: remove old version?
 def extract_text_from_epub_old(epub_path):
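
Since index.max_result_window is a dynamic index setting, the put_settings call above takes effect on a live index without a close and reopen. A small sketch, not part of the commit, that reads the setting back and raises it only when needed; es and INDEX_NAME are assumed from context:

# Sketch, not part of this commit: verify the live setting instead of
# assuming it. `es` and INDEX_NAME are assumed from context.
def ensure_result_window(window=50000):
    settings = es.indices.get_settings(index=INDEX_NAME)
    current = settings[INDEX_NAME]["settings"]["index"].get("max_result_window")
    if current is None or int(current) < window:
        es.indices.put_settings(
            index=INDEX_NAME,
            body={"index": {"max_result_window": window}},
        )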