Enhance indexing functionality: add file indexing check and token count retrieval; update file and folder display to show indexing status and token counts.

This commit is contained in:
Dmitriy Kazimirov 2025-04-06 08:43:26 +00:00
parent 41c690f621
commit 22fda22802
3 changed files with 229 additions and 10 deletions
src
api
core

View file

@ -10,7 +10,7 @@ import PyPDF2
import time
import logging
import multiprocessing
from src.core.index import index_files, get_progress
from src.core.index import index_files, get_progress, is_file_indexed
from io import StringIO
import sys
import re
@ -277,16 +277,29 @@ def list_files():
item_path = os.path.join(books_dir, item_name)
if os.path.isdir(item_path) and not item_name.startswith('.'):
# Count files in this directory for display
# Count files and indexing stats in this directory
dir_file_count = 0
for _, _, files in os.walk(item_path):
dir_file_count += len(files)
dir_indexed_count = 0
dir_token_count = 0
for root, _, files in os.walk(item_path):
for file in files:
file_path = os.path.join(root, file)
dir_file_count += 1
# Check if the file is indexed and get token count
is_indexed, token_count = is_file_indexed(file_path)
if is_indexed:
dir_indexed_count += 1
dir_token_count += token_count
file_tree.append({
'type': 'directory',
'name': item_name,
'path': item_name,
'file_count': dir_file_count
'file_count': dir_file_count,
'indexed_count': dir_indexed_count,
'token_count': dir_token_count
})
# Then process files
@ -302,13 +315,18 @@ def list_files():
if len(title_parts) > 1:
title = ' - '.join(title_parts[:-1]) # Take all but last part
# Check if the file is indexed and get token count
is_indexed, token_count = is_file_indexed(item_path)
file_tree.append({
'type': 'file',
'name': item_name,
'title': title,
'path': item_name,
'size': file_size,
'size_mb': round(file_size / (1024 * 1024), 2)
'size_mb': round(file_size / (1024 * 1024), 2),
'is_indexed': is_indexed,
'token_count': token_count
})
total_files += 1
total_size += file_size
@ -745,10 +763,29 @@ def get_folder_contents(folder_path):
rel_path = os.path.join(decoded_path, item)
if os.path.isdir(item_path):
# Count files and indexing stats in this directory
dir_file_count = 0
dir_indexed_count = 0
dir_token_count = 0
for root, _, files in os.walk(item_path):
for file in files:
file_path = os.path.join(root, file)
dir_file_count += 1
# Check if the file is indexed and get token count
is_indexed, token_count = is_file_indexed(file_path)
if is_indexed:
dir_indexed_count += 1
dir_token_count += token_count
contents.append({
'type': 'directory',
'name': item,
'path': rel_path
'path': rel_path,
'file_count': dir_file_count,
'indexed_count': dir_indexed_count,
'token_count': dir_token_count
})
elif os.path.isfile(item_path):
file_size = os.path.getsize(item_path)
@ -759,13 +796,18 @@ def get_folder_contents(folder_path):
if len(title_parts) > 1:
title = ' - '.join(title_parts[:-1]) # Take all but last part
# Check if the file is indexed and get token count
is_indexed, token_count = is_file_indexed(item_path)
contents.append({
'type': 'file',
'name': item,
'title': title,
'path': rel_path,
'size': file_size,
'size_mb': round(file_size / (1024 * 1024), 2)
'size_mb': round(file_size / (1024 * 1024), 2),
'is_indexed': is_indexed,
'token_count': token_count
})
return jsonify({

View file

@ -109,6 +109,41 @@
.breadcrumb a:hover {
text-decoration: underline;
}
/* Indexing status styles */
.index-status {
display: inline-flex;
align-items: center;
margin-left: 10px;
padding: 2px 6px;
border-radius: 4px;
font-size: 0.8em;
}
.index-status.indexed {
background-color: #e3f2fd;
color: #1565c0;
border: 1px solid #bbdefb;
}
.index-status.not-indexed {
background-color: #f5f5f5;
color: #757575;
border: 1px solid #e0e0e0;
}
.index-icon {
margin-right: 4px;
}
.token-count {
font-family: monospace;
}
.folder-index-status {
margin-left: 8px;
font-size: 0.75em;
}
</style>
</head>
<body>
@ -134,6 +169,21 @@
<span class="summary-label">Total Size:</span>
<span>{{ total_size_mb }} MB</span>
</div>
<div class="summary-item">
<span class="summary-label">Indexed Files:</span>
<span>
{% set indexed_count = file_tree|selectattr('is_indexed', 'defined')|selectattr('is_indexed')|list|length %}
{{ indexed_count }} / {{ file_tree|selectattr('type', 'equalto', 'file')|list|length }}
({{ (indexed_count / file_tree|selectattr('type', 'equalto', 'file')|list|length * 100)|round|int if file_tree|selectattr('type', 'equalto', 'file')|list|length > 0 else 0 }}%)
</span>
</div>
<div class="summary-item">
<span class="summary-label">Total Tokens:</span>
<span>
{% set total_tokens = file_tree|selectattr('token_count', 'defined')|map(attribute='token_count')|sum %}
{{ total_tokens|default(0)|int }}
</span>
</div>
</div>
{% if indexing_in_progress %}
@ -152,7 +202,22 @@
<li class="folder-item" onclick="toggleFolder(this, event)">
<div class="folder-name">
<span class="folder-icon">📁</span>
{{ item.name }} <span class="file-name-muted">({{ item.file_count }} files)</span>
{{ item.name }}
<span class="file-name-muted">
({{ item.file_count }} files,
{{ item.indexed_count }} indexed,
{{ item.token_count }} tokens)
</span>
{% if item.indexed_count > 0 %}
<span class="index-status indexed folder-index-status" title="{{ item.indexed_count }} of {{ item.file_count }} files indexed with {{ item.token_count }} tokens">
<span class="index-icon">📑</span>
<span class="token-count">{{ (item.indexed_count / item.file_count * 100)|round|int }}%</span>
</span>
{% else %}
<span class="index-status not-indexed folder-index-status" title="No files indexed in this folder">
<span class="index-icon">📄</span>
</span>
{% endif %}
</div>
<div class="folder-contents" id="folder-{{ item.path|replace('/', '-') }}">
<!-- Contents will be populated dynamically -->
@ -177,7 +242,19 @@
{% endif %}
</a>
</span>
<span class="file-size">{{ item.size_mb }} MB</span>
<span class="file-size">
{{ item.size_mb }} MB
{% if item.is_indexed %}
<span class="index-status indexed" title="File is indexed with {{ item.token_count }} tokens">
<span class="index-icon">📑</span>
<span class="token-count">{{ item.token_count }} tokens</span>
</span>
{% else %}
<span class="index-status not-indexed" title="File is not indexed">
<span class="index-icon">📄</span>
</span>
{% endif %}
</span>
</li>
{% endif %}
{% endfor %}
@ -251,6 +328,42 @@
folderNameDiv.appendChild(folderIcon);
folderNameDiv.appendChild(document.createTextNode(item.name));
// Add file count and indexing info
const fileCountSpan = document.createElement('span');
fileCountSpan.className = 'file-name-muted';
fileCountSpan.textContent = ` (${item.file_count} files, ${item.indexed_count} indexed, ${item.token_count} tokens)`;
folderNameDiv.appendChild(fileCountSpan);
// Add indexing status indicator
if (item.indexed_count > 0) {
const indexStatus = document.createElement('span');
indexStatus.className = 'index-status indexed folder-index-status';
indexStatus.title = `${item.indexed_count} of ${item.file_count} files indexed with ${item.token_count} tokens`;
const indexIcon = document.createElement('span');
indexIcon.className = 'index-icon';
indexIcon.textContent = '📑';
const percentIndexed = document.createElement('span');
percentIndexed.className = 'token-count';
percentIndexed.textContent = `${Math.round(item.indexed_count / item.file_count * 100)}%`;
indexStatus.appendChild(indexIcon);
indexStatus.appendChild(percentIndexed);
folderNameDiv.appendChild(indexStatus);
} else {
const indexStatus = document.createElement('span');
indexStatus.className = 'index-status not-indexed folder-index-status';
indexStatus.title = 'No files indexed in this folder';
const indexIcon = document.createElement('span');
indexIcon.className = 'index-icon';
indexIcon.textContent = '📄';
indexStatus.appendChild(indexIcon);
folderNameDiv.appendChild(indexStatus);
}
const folderContents = document.createElement('div');
folderContents.className = 'folder-contents';
folderContents.id = 'folder-' + item.path.replace(/\//g, '-');
@ -321,8 +434,40 @@
const fileSizeSpan = document.createElement('span');
fileSizeSpan.className = 'file-size';
// Add file size
fileSizeSpan.textContent = item.size_mb + ' MB';
// Add indexing status
if (item.is_indexed) {
const indexStatus = document.createElement('span');
indexStatus.className = 'index-status indexed';
indexStatus.title = 'File is indexed with ' + item.token_count + ' tokens';
const indexIcon = document.createElement('span');
indexIcon.className = 'index-icon';
indexIcon.textContent = '📑';
const tokenCount = document.createElement('span');
tokenCount.className = 'token-count';
tokenCount.textContent = item.token_count + ' tokens';
indexStatus.appendChild(indexIcon);
indexStatus.appendChild(tokenCount);
fileSizeSpan.appendChild(indexStatus);
} else {
const indexStatus = document.createElement('span');
indexStatus.className = 'index-status not-indexed';
indexStatus.title = 'File is not indexed';
const indexIcon = document.createElement('span');
indexIcon.className = 'index-icon';
indexIcon.textContent = '📄';
indexStatus.appendChild(indexIcon);
fileSizeSpan.appendChild(indexStatus);
}
listItem.appendChild(fileNameSpan);
listItem.appendChild(fileSizeSpan);
}

View file

@ -13,6 +13,38 @@ ELASTICSEARCH_PORT = int(os.environ.get("ELASTICSEARCH_PORT", 9200))
es = Elasticsearch([{'host': ELASTICSEARCH_HOST, 'port': ELASTICSEARCH_PORT, 'scheme': 'http'}])
INDEX_NAME = "book_index"
def is_file_indexed(file_path):
    """Check whether a file is present in the index and count its tokens.

    Args:
        file_path: Path of the file to look up. A path that does not already
            start with ``/books/`` is joined onto ``/books`` before the lookup.

    Returns:
        A ``(is_indexed, token_count)`` tuple. ``token_count`` is the number
        of whitespace-separated words in the stored ``content`` field. It is
        ``0`` when the file is not indexed, when the stored document has no
        content, or when the lookup fails (the error is printed, not raised).
    """
    try:
        # Normalize the file path to match how it's stored in the index
        if file_path.startswith("/books/"):
            normalized_path = file_path
        else:
            # NOTE(review): os.path.join drops "/books" when file_path is
            # itself absolute -- confirm callers only pass /books-rooted or
            # relative paths.
            normalized_path = os.path.join("/books", file_path)

        # Only the first match and only its "content" field are needed, so
        # ask for exactly that instead of the default page of full documents.
        query = {
            "size": 1,
            "_source": ["content"],
            "query": {
                "term": {
                    "file_path.keyword": normalized_path
                }
            }
        }

        result = es.search(index=INDEX_NAME, body=query)

        # Inspect the returned hits rather than hits.total, whose shape
        # (integer vs. {"value": ...}) differs between server versions.
        hits = result['hits']['hits']
        if not hits:
            return False, 0

        # Count tokens (words); a document without content has zero tokens
        # but is still indexed.
        content = hits[0].get('_source', {}).get('content') or ""
        return True, len(content.split())
    except Exception as e:
        # Best-effort lookup: callers render listings and must not crash
        # when the index is unavailable.
        print(f"Error checking if file is indexed: {e}")
        return False, 0
# Global variables for progress tracking
indexing_progress = {
'total_files': 0,