Update Elasticsearch configuration and increase max_result_window; bump application version to 0.0.9
parent da243e7f2f
commit 2928577477
3 changed files with 44 additions and 20 deletions
@@ -40,6 +40,9 @@ services:
       - ELASTICSEARCH_PLUGINS=analysis-stempel
       - ES_JAVA_OPTS=-Xms6g -Xmx6g
       - bootstrap.memory_lock=true
+      - "ELASTICSEARCH_HEAP_SIZE=6g"
+      - "ELASTICSEARCH_EXTRA_JAVA_OPTS=-Xms6g -Xmx6g"
+      - "ELASTICSEARCH_CLUSTER_SETTINGS=index.max_result_window=50000"
     restart: unless-stopped
     deploy:
       resources:
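Note: the compose file now sets the heap through three overlapping variables (ES_JAVA_OPTS, ELASTICSEARCH_HEAP_SIZE, ELASTICSEARCH_EXTRA_JAVA_OPTS), and index.max_result_window is an index-level setting rather than a cluster-wide one, so it is worth confirming what the node actually applied. A minimal sketch, assuming elasticsearch-py and a node on localhost:9200 (the client URL is a placeholder, not from this repo):

from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")  # assumed local dev address

# Report the effective JVM heap per node, regardless of which of the
# overlapping heap variables the image actually honoured.
info = es.nodes.info(metric="jvm")
for node in info["nodes"].values():
    heap_gib = node["jvm"]["mem"]["heap_max_in_bytes"] / 1024**3
    print(f"{node['name']}: heap_max = {heap_gib:.1f} GiB")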
@@ -55,7 +55,7 @@ from markdown_it import MarkdownIt
 from functools import lru_cache
 from threading import Lock
 # Application version
-APP_VERSION = "0.0.8 (2025 Apr 6th)"
+APP_VERSION = "0.0.9 (2025 Apr 6th)"
 
 # Configuration constants
 ITEMS_PER_PAGE = int(os.environ.get("ITEMS_PER_PAGE", 50))  # Default items per page
@@ -59,26 +59,33 @@ def are_files_indexed(file_paths):
         else:
             normalized_paths.append(os.path.join("/books", path))
 
-    # Build a batch query
-    query = {
-        "size": len(normalized_paths),  # Return all matching documents
-        "query": {
-            "terms": {
-                "file_path.keyword": normalized_paths
-            }
-        }
-    }
+    # Process in batches to avoid Elasticsearch result window limits
+    batch_size = 5000  # Safe batch size below the 10000 limit
+    indexed_files = {}
 
-    # Execute the query
-    result = es.search(index=INDEX_NAME, body=query)
+    # Process normalized paths in batches
+    for i in range(0, len(normalized_paths), batch_size):
+        batch_paths = normalized_paths[i:i+batch_size]
+
+        # Build a batch query
+        query = {
+            "size": batch_size,
+            "query": {
+                "terms": {
+                    "file_path.keyword": batch_paths
+                }
+            }
+        }
 
-    # Process results
-    indexed_files = {}
-    for hit in result['hits']['hits']:
-        path = hit['_source']['file_path']
-        content = hit['_source']['content']
-        token_count = len(content.split())
-        indexed_files[path] = (True, token_count)
+        # Execute the query
+        result = es.search(index=INDEX_NAME, body=query)
+
+        # Process batch results
+        for hit in result['hits']['hits']:
+            path = hit['_source']['file_path']
+            content = hit['_source']['content']
+            token_count = len(content.split())
+            indexed_files[path] = (True, token_count)
 
     # Build the complete result dictionary
     results = {}
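The batching above keeps each terms query under the result window, but each batch still depends on "size" covering every hit it matches. A sketch of an alternative that sidesteps index.max_result_window entirely via elasticsearch-py's scan helper (scroll API); the client URL and INDEX_NAME value here are placeholders, while the field names and /books paths follow the code in the diff:

from elasticsearch import Elasticsearch, helpers

es = Elasticsearch("http://localhost:9200")  # assumed address
INDEX_NAME = "books"                         # placeholder; the app defines its own

def are_files_indexed_scan(paths, batch_size=5000):
    """Variant of are_files_indexed() that streams hits instead of capping size."""
    indexed_files = {}
    for i in range(0, len(paths), batch_size):
        batch = paths[i:i + batch_size]
        query = {"query": {"terms": {"file_path.keyword": batch}}}
        # helpers.scan pages through every matching document, so no
        # "size" cap or raised result window is needed.
        for hit in helpers.scan(es, index=INDEX_NAME, query=query):
            token_count = len(hit["_source"]["content"].split())
            indexed_files[hit["_source"]["file_path"]] = (True, token_count)
    return indexed_files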
@@ -111,10 +118,24 @@ def create_index():
                     "index.refresh_interval": "5s",
                     "index.number_of_replicas": 0,
                     "index.translog.durability": "async",
-                    "index.mapping.total_fields.limit": 10000
+                    "index.mapping.total_fields.limit": 10000,
+                    "index.max_result_window": 50000  # Increased from default 10000
                 }
             }
         )
+    else:
+        # Update existing index settings to increase max_result_window
+        try:
+            es.indices.put_settings(
+                index=INDEX_NAME,
+                body={
+                    "index": {
+                        "max_result_window": 50000
+                    }
+                }
+            )
+        except Exception as e:
+            print(f"Warning: Could not update max_result_window setting: {e}")
 
 # TODO: remove old version?
 def extract_text_from_epub_old(epub_path):
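Since a failed put_settings on the existing index is only printed as a warning, a quick follow-up check that the new window is actually live can help. A sketch assuming the same es client and INDEX_NAME as above:

# Confirm the dynamic setting took effect on the existing index.
settings = es.indices.get_settings(index=INDEX_NAME, name="index.max_result_window")
for name, data in settings.items():
    window = data["settings"].get("index", {}).get("max_result_window", "default (10000)")
    print(f"{name}: index.max_result_window = {window}")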