Enhance indexing functionality: add file indexing check and token count retrieval; update file and folder display to show indexing status and token counts.
This commit is contained in:
parent
41c690f621
commit
22fda22802
3 changed files with 229 additions and 10 deletions
src
|
@ -10,7 +10,7 @@ import PyPDF2
|
|||
import time
|
||||
import logging
|
||||
import multiprocessing
|
||||
from src.core.index import index_files, get_progress
|
||||
from src.core.index import index_files, get_progress, is_file_indexed
|
||||
from io import StringIO
|
||||
import sys
|
||||
import re
|
||||
|
@ -277,16 +277,29 @@ def list_files():
|
|||
item_path = os.path.join(books_dir, item_name)
|
||||
|
||||
if os.path.isdir(item_path) and not item_name.startswith('.'):
|
||||
# Count files in this directory for display
|
||||
# Count files and indexing stats in this directory
|
||||
dir_file_count = 0
|
||||
for _, _, files in os.walk(item_path):
|
||||
dir_file_count += len(files)
|
||||
dir_indexed_count = 0
|
||||
dir_token_count = 0
|
||||
|
||||
for root, _, files in os.walk(item_path):
|
||||
for file in files:
|
||||
file_path = os.path.join(root, file)
|
||||
dir_file_count += 1
|
||||
|
||||
# Check if the file is indexed and get token count
|
||||
is_indexed, token_count = is_file_indexed(file_path)
|
||||
if is_indexed:
|
||||
dir_indexed_count += 1
|
||||
dir_token_count += token_count
|
||||
|
||||
file_tree.append({
|
||||
'type': 'directory',
|
||||
'name': item_name,
|
||||
'path': item_name,
|
||||
'file_count': dir_file_count
|
||||
'file_count': dir_file_count,
|
||||
'indexed_count': dir_indexed_count,
|
||||
'token_count': dir_token_count
|
||||
})
|
||||
|
||||
# Then process files
|
||||
|
@ -302,13 +315,18 @@ def list_files():
|
|||
if len(title_parts) > 1:
|
||||
title = ' - '.join(title_parts[:-1]) # Take all but last part
|
||||
|
||||
# Check if the file is indexed and get token count
|
||||
is_indexed, token_count = is_file_indexed(item_path)
|
||||
|
||||
file_tree.append({
|
||||
'type': 'file',
|
||||
'name': item_name,
|
||||
'title': title,
|
||||
'path': item_name,
|
||||
'size': file_size,
|
||||
'size_mb': round(file_size / (1024 * 1024), 2)
|
||||
'size_mb': round(file_size / (1024 * 1024), 2),
|
||||
'is_indexed': is_indexed,
|
||||
'token_count': token_count
|
||||
})
|
||||
total_files += 1
|
||||
total_size += file_size
|
||||
|
@ -745,10 +763,29 @@ def get_folder_contents(folder_path):
|
|||
rel_path = os.path.join(decoded_path, item)
|
||||
|
||||
if os.path.isdir(item_path):
|
||||
# Count files and indexing stats in this directory
|
||||
dir_file_count = 0
|
||||
dir_indexed_count = 0
|
||||
dir_token_count = 0
|
||||
|
||||
for root, _, files in os.walk(item_path):
|
||||
for file in files:
|
||||
file_path = os.path.join(root, file)
|
||||
dir_file_count += 1
|
||||
|
||||
# Check if the file is indexed and get token count
|
||||
is_indexed, token_count = is_file_indexed(file_path)
|
||||
if is_indexed:
|
||||
dir_indexed_count += 1
|
||||
dir_token_count += token_count
|
||||
|
||||
contents.append({
|
||||
'type': 'directory',
|
||||
'name': item,
|
||||
'path': rel_path
|
||||
'path': rel_path,
|
||||
'file_count': dir_file_count,
|
||||
'indexed_count': dir_indexed_count,
|
||||
'token_count': dir_token_count
|
||||
})
|
||||
elif os.path.isfile(item_path):
|
||||
file_size = os.path.getsize(item_path)
|
||||
|
@ -759,13 +796,18 @@ def get_folder_contents(folder_path):
|
|||
if len(title_parts) > 1:
|
||||
title = ' - '.join(title_parts[:-1]) # Take all but last part
|
||||
|
||||
# Check if the file is indexed and get token count
|
||||
is_indexed, token_count = is_file_indexed(item_path)
|
||||
|
||||
contents.append({
|
||||
'type': 'file',
|
||||
'name': item,
|
||||
'title': title,
|
||||
'path': rel_path,
|
||||
'size': file_size,
|
||||
'size_mb': round(file_size / (1024 * 1024), 2)
|
||||
'size_mb': round(file_size / (1024 * 1024), 2),
|
||||
'is_indexed': is_indexed,
|
||||
'token_count': token_count
|
||||
})
|
||||
|
||||
return jsonify({
|
||||
|
|
|
@ -109,6 +109,41 @@
|
|||
.breadcrumb a:hover {
|
||||
text-decoration: underline;
|
||||
}
|
||||
|
||||
/* Indexing status styles */
|
||||
.index-status {
|
||||
display: inline-flex;
|
||||
align-items: center;
|
||||
margin-left: 10px;
|
||||
padding: 2px 6px;
|
||||
border-radius: 4px;
|
||||
font-size: 0.8em;
|
||||
}
|
||||
|
||||
.index-status.indexed {
|
||||
background-color: #e3f2fd;
|
||||
color: #1565c0;
|
||||
border: 1px solid #bbdefb;
|
||||
}
|
||||
|
||||
.index-status.not-indexed {
|
||||
background-color: #f5f5f5;
|
||||
color: #757575;
|
||||
border: 1px solid #e0e0e0;
|
||||
}
|
||||
|
||||
.index-icon {
|
||||
margin-right: 4px;
|
||||
}
|
||||
|
||||
.token-count {
|
||||
font-family: monospace;
|
||||
}
|
||||
|
||||
.folder-index-status {
|
||||
margin-left: 8px;
|
||||
font-size: 0.75em;
|
||||
}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
@ -134,6 +169,21 @@
|
|||
<span class="summary-label">Total Size:</span>
|
||||
<span>{{ total_size_mb }} MB</span>
|
||||
</div>
|
||||
<div class="summary-item">
|
||||
<span class="summary-label">Indexed Files:</span>
|
||||
<span>
|
||||
{% set indexed_count = file_tree|selectattr('is_indexed', 'defined')|selectattr('is_indexed')|list|length %}
|
||||
{{ indexed_count }} / {{ file_tree|selectattr('type', 'equalto', 'file')|list|length }}
|
||||
({{ (indexed_count / file_tree|selectattr('type', 'equalto', 'file')|list|length * 100)|round|int if file_tree|selectattr('type', 'equalto', 'file')|list|length > 0 else 0 }}%)
|
||||
</span>
|
||||
</div>
|
||||
<div class="summary-item">
|
||||
<span class="summary-label">Total Tokens:</span>
|
||||
<span>
|
||||
{% set total_tokens = file_tree|selectattr('token_count', 'defined')|map(attribute='token_count')|sum %}
|
||||
{{ total_tokens|default(0)|int }}
|
||||
</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
{% if indexing_in_progress %}
|
||||
|
@ -152,7 +202,22 @@
|
|||
<li class="folder-item" onclick="toggleFolder(this, event)">
|
||||
<div class="folder-name">
|
||||
<span class="folder-icon">📁</span>
|
||||
{{ item.name }} <span class="file-name-muted">({{ item.file_count }} files)</span>
|
||||
{{ item.name }}
|
||||
<span class="file-name-muted">
|
||||
({{ item.file_count }} files,
|
||||
{{ item.indexed_count }} indexed,
|
||||
{{ item.token_count }} tokens)
|
||||
</span>
|
||||
{% if item.indexed_count > 0 %}
|
||||
<span class="index-status indexed folder-index-status" title="{{ item.indexed_count }} of {{ item.file_count }} files indexed with {{ item.token_count }} tokens">
|
||||
<span class="index-icon">📑</span>
|
||||
<span class="token-count">{{ (item.indexed_count / item.file_count * 100)|round|int }}%</span>
|
||||
</span>
|
||||
{% else %}
|
||||
<span class="index-status not-indexed folder-index-status" title="No files indexed in this folder">
|
||||
<span class="index-icon">📄</span>
|
||||
</span>
|
||||
{% endif %}
|
||||
</div>
|
||||
<div class="folder-contents" id="folder-{{ item.path|replace('/', '-') }}">
|
||||
<!-- Contents will be populated dynamically -->
|
||||
|
@ -177,7 +242,19 @@
|
|||
{% endif %}
|
||||
</a>
|
||||
</span>
|
||||
<span class="file-size">{{ item.size_mb }} MB</span>
|
||||
<span class="file-size">
|
||||
{{ item.size_mb }} MB
|
||||
{% if item.is_indexed %}
|
||||
<span class="index-status indexed" title="File is indexed with {{ item.token_count }} tokens">
|
||||
<span class="index-icon">📑</span>
|
||||
<span class="token-count">{{ item.token_count }} tokens</span>
|
||||
</span>
|
||||
{% else %}
|
||||
<span class="index-status not-indexed" title="File is not indexed">
|
||||
<span class="index-icon">📄</span>
|
||||
</span>
|
||||
{% endif %}
|
||||
</span>
|
||||
</li>
|
||||
{% endif %}
|
||||
{% endfor %}
|
||||
|
@ -251,6 +328,42 @@
|
|||
folderNameDiv.appendChild(folderIcon);
|
||||
folderNameDiv.appendChild(document.createTextNode(item.name));
|
||||
|
||||
// Add file count and indexing info
|
||||
const fileCountSpan = document.createElement('span');
|
||||
fileCountSpan.className = 'file-name-muted';
|
||||
fileCountSpan.textContent = ` (${item.file_count} files, ${item.indexed_count} indexed, ${item.token_count} tokens)`;
|
||||
folderNameDiv.appendChild(fileCountSpan);
|
||||
|
||||
// Add indexing status indicator
|
||||
if (item.indexed_count > 0) {
|
||||
const indexStatus = document.createElement('span');
|
||||
indexStatus.className = 'index-status indexed folder-index-status';
|
||||
indexStatus.title = `${item.indexed_count} of ${item.file_count} files indexed with ${item.token_count} tokens`;
|
||||
|
||||
const indexIcon = document.createElement('span');
|
||||
indexIcon.className = 'index-icon';
|
||||
indexIcon.textContent = '📑';
|
||||
|
||||
const percentIndexed = document.createElement('span');
|
||||
percentIndexed.className = 'token-count';
|
||||
percentIndexed.textContent = `${Math.round(item.indexed_count / item.file_count * 100)}%`;
|
||||
|
||||
indexStatus.appendChild(indexIcon);
|
||||
indexStatus.appendChild(percentIndexed);
|
||||
folderNameDiv.appendChild(indexStatus);
|
||||
} else {
|
||||
const indexStatus = document.createElement('span');
|
||||
indexStatus.className = 'index-status not-indexed folder-index-status';
|
||||
indexStatus.title = 'No files indexed in this folder';
|
||||
|
||||
const indexIcon = document.createElement('span');
|
||||
indexIcon.className = 'index-icon';
|
||||
indexIcon.textContent = '📄';
|
||||
|
||||
indexStatus.appendChild(indexIcon);
|
||||
folderNameDiv.appendChild(indexStatus);
|
||||
}
|
||||
|
||||
const folderContents = document.createElement('div');
|
||||
folderContents.className = 'folder-contents';
|
||||
folderContents.id = 'folder-' + item.path.replace(/\//g, '-');
|
||||
|
@ -321,8 +434,40 @@
|
|||
|
||||
const fileSizeSpan = document.createElement('span');
|
||||
fileSizeSpan.className = 'file-size';
|
||||
|
||||
// Add file size
|
||||
fileSizeSpan.textContent = item.size_mb + ' MB';
|
||||
|
||||
// Add indexing status
|
||||
if (item.is_indexed) {
|
||||
const indexStatus = document.createElement('span');
|
||||
indexStatus.className = 'index-status indexed';
|
||||
indexStatus.title = 'File is indexed with ' + item.token_count + ' tokens';
|
||||
|
||||
const indexIcon = document.createElement('span');
|
||||
indexIcon.className = 'index-icon';
|
||||
indexIcon.textContent = '📑';
|
||||
|
||||
const tokenCount = document.createElement('span');
|
||||
tokenCount.className = 'token-count';
|
||||
tokenCount.textContent = item.token_count + ' tokens';
|
||||
|
||||
indexStatus.appendChild(indexIcon);
|
||||
indexStatus.appendChild(tokenCount);
|
||||
fileSizeSpan.appendChild(indexStatus);
|
||||
} else {
|
||||
const indexStatus = document.createElement('span');
|
||||
indexStatus.className = 'index-status not-indexed';
|
||||
indexStatus.title = 'File is not indexed';
|
||||
|
||||
const indexIcon = document.createElement('span');
|
||||
indexIcon.className = 'index-icon';
|
||||
indexIcon.textContent = '📄';
|
||||
|
||||
indexStatus.appendChild(indexIcon);
|
||||
fileSizeSpan.appendChild(indexStatus);
|
||||
}
|
||||
|
||||
listItem.appendChild(fileNameSpan);
|
||||
listItem.appendChild(fileSizeSpan);
|
||||
}
|
||||
|
|
|
@ -13,6 +13,38 @@ ELASTICSEARCH_PORT = int(os.environ.get("ELASTICSEARCH_PORT", 9200))
|
|||
es = Elasticsearch([{'host': ELASTICSEARCH_HOST, 'port': ELASTICSEARCH_PORT, 'scheme': 'http'}])
|
||||
INDEX_NAME = "book_index"
|
||||
|
||||
def is_file_indexed(file_path):
    """Check whether a file has a document in the index and count its tokens.

    Args:
        file_path: Path of the file to look up. A path that does not already
            start with "/books/" is joined onto "/books" before the lookup.

    Returns:
        A ``(is_indexed, token_count)`` tuple. ``token_count`` is the number
        of whitespace-separated words in the matching document's ``content``
        field. It is 0 when no document matches, when the matching document
        has no content, or when the lookup raises (the error is printed and
        the file is reported as not indexed).
    """
    try:
        # Normalize the file path to match how it's stored in the index.
        # NOTE(review): os.path.join drops "/books" when file_path is itself
        # absolute, and a relative path that already begins with "books/"
        # becomes "/books/books/..." -- confirm what callers pass in.
        if file_path.startswith("/books/"):
            normalized_path = file_path
        else:
            normalized_path = os.path.join("/books", file_path)

        # Exact-match lookup on the keyword sub-field. Only one hit and only
        # the "content" field are requested: this runs once per file while
        # listing directories, so fetching the default 10 full documents per
        # call would move far more data than the token count needs.
        query = {
            "size": 1,
            "_source": ["content"],
            "query": {
                "term": {
                    "file_path.keyword": normalized_path
                }
            }
        }

        result = es.search(index=INDEX_NAME, body=query)

        # Decide on the returned hits themselves rather than on
        # hits.total.value, so the check does not depend on the shape of the
        # "total" field in the response.
        hits = result['hits']['hits']
        if not hits:
            return False, 0

        # A matching document with a missing or null "content" field is
        # still indexed; it simply contributes zero tokens.
        source = hits[0].get('_source') or {}
        content = source.get('content') or ''
        return True, len(content.split())
    except Exception as e:
        # Best-effort: a failed lookup must not break the file listing.
        print(f"Error checking if file is indexed: {e}")
        return False, 0
|
||||
|
||||
# Global variables for progress tracking
|
||||
indexing_progress = {
|
||||
'total_files': 0,
|
||||
|
|
Loading…
Add table
Reference in a new issue