refactor: rework how symlinks are processed (no longer resolve) (#248)

This changes how we handle symlinks: we no longer resolve them to their targets, which should noticeably reduce the complexity of the ingestion code.

We also now show the symlink's target name in the output, in the form `link -> target`.

I also added a `.vscode/launch.json` file for debugging, because it took me a while to figure out how to get the debugger to work.

Yeah, that's it.

Please test before merging because I'm a bit of a dingus sometimes
This commit is contained in:
Nicolas Iragne 2025-04-02 01:35:20 +02:00 committed by GitHub
parent 8be6f5620f
commit cdeadf510d
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
4 changed files with 55 additions and 32 deletions

12
.vscode/launch.json vendored Normal file
View file

@ -0,0 +1,12 @@
{
"configurations": [
{
"name": "Python Debugger: Module",
"type": "debugpy",
"request": "launch",
"module": "uvicorn",
"args": ["server.main:app", "--host", "0.0.0.0", "--port", "8000"],
"cwd": "${workspaceFolder}/src"
}
]
}

View file

@ -9,7 +9,6 @@ from gitingest.output_formatters import format_node
from gitingest.query_parsing import IngestionQuery
from gitingest.schemas import FileSystemNode, FileSystemNodeType, FileSystemStats
from gitingest.utils.ingestion_utils import _should_exclude, _should_include
from gitingest.utils.path_utils import _is_safe_symlink
try:
import tomllib # type: ignore[import]
@ -171,11 +170,6 @@ def _process_node(
The parsed query object containing information about the repository and query parameters.
stats : FileSystemStats
Statistics tracking object for the total file count and size.
Raises
------
ValueError
If an unexpected error occurs during processing.
"""
if limit_exceeded(stats, node.depth):
@ -183,28 +177,15 @@ def _process_node(
for sub_path in node.path.iterdir():
symlink_path = None
if sub_path.is_symlink():
if not _is_safe_symlink(sub_path, query.local_path):
print(f"Skipping unsafe symlink: {sub_path}")
continue
symlink_path = sub_path
sub_path = sub_path.resolve()
if sub_path in stats.visited:
print(f"Skipping already visited path: {sub_path}")
continue
stats.visited.add(sub_path)
if query.ignore_patterns and _should_exclude(sub_path, query.local_path, query.ignore_patterns):
continue
if query.include_patterns and not _should_include(sub_path, query.local_path, query.include_patterns):
continue
if sub_path.is_file():
if sub_path.is_symlink():
_process_symlink(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path)
elif sub_path.is_file():
_process_file(path=sub_path, parent_node=node, stats=stats, local_path=query.local_path)
elif sub_path.is_dir():
@ -216,11 +197,6 @@ def _process_node(
depth=node.depth + 1,
)
# rename the subdir to reflect the symlink name
if symlink_path:
child_directory_node.name = symlink_path.name
child_directory_node.path_str = str(symlink_path)
_process_node(
node=child_directory_node,
query=query,
@ -230,13 +206,41 @@ def _process_node(
node.size += child_directory_node.size
node.file_count += child_directory_node.file_count
node.dir_count += 1 + child_directory_node.dir_count
else:
raise ValueError(f"Unexpected error: {sub_path} is neither a file nor a directory")
print(f"Warning: {sub_path} is an unknown file type, skipping")
node.sort_children()
def _process_symlink(path: Path, parent_node: FileSystemNode, stats: FileSystemStats, local_path: Path) -> None:
    """
    Register a symlink as a child node of its parent directory.

    The link itself is recorded as a ``SYMLINK`` node; this function does not resolve or follow it.
    The link is counted as one file, both on the parent node and in the global statistics.

    Parameters
    ----------
    path : Path
        The full path of the symlink.
    parent_node : FileSystemNode
        The parent directory node; the new node is appended to its children and its file count is incremented.
    stats : FileSystemStats
        Statistics tracking object; its total file count is incremented.
    local_path : Path
        The base path of the repository or directory being processed, used to derive the node's relative path.
    """
    # Derive the display path first, so a failure here happens before any counters are modified.
    relative_path_str = str(path.relative_to(local_path))
    child_depth = parent_node.depth + 1

    symlink_node = FileSystemNode(
        name=path.name,
        type=FileSystemNodeType.SYMLINK,
        path_str=relative_path_str,
        path=path,
        depth=child_depth,
    )

    # A symlink counts as a single file everywhere it is tallied.
    stats.total_files += 1
    parent_node.children.append(symlink_node)
    parent_node.file_count += 1
def _process_file(path: Path, parent_node: FileSystemNode, stats: FileSystemStats, local_path: Path) -> None:
"""
Process a file in the file system.

View file

@ -31,7 +31,7 @@ def format_node(node: FileSystemNode, query: IngestionQuery) -> Tuple[str, str,
if node.type == FileSystemNodeType.DIRECTORY:
summary += f"Files analyzed: {node.file_count}\n"
else:
elif node.type == FileSystemNodeType.FILE:
summary += f"File: {node.name}\n"
summary += f"Lines: {len(node.content.splitlines()):,}\n"
@ -101,7 +101,7 @@ def _gather_file_contents(node: FileSystemNode) -> str:
str
The concatenated content of all files under the given node.
"""
if node.type == FileSystemNodeType.FILE:
if node.type != FileSystemNodeType.DIRECTORY:
return node.content_string
# Recursively gather contents of all files under the current directory
@ -142,6 +142,8 @@ def _create_tree_structure(query: IngestionQuery, node: FileSystemNode, prefix:
display_name = node.name
if node.type == FileSystemNodeType.DIRECTORY:
display_name += "/"
elif node.type == FileSystemNodeType.SYMLINK:
display_name += " -> " + node.path.readlink().name
tree_str += f"{prefix}{current_prefix}{display_name}\n"

View file

@ -18,6 +18,7 @@ class FileSystemNodeType(Enum):
DIRECTORY = auto()
FILE = auto()
SYMLINK = auto()
@dataclass
@ -91,7 +92,8 @@ class FileSystemNode: # pylint: disable=too-many-instance-attributes
"""
parts = [
SEPARATOR,
f"File: {str(self.path_str).replace(os.sep, '/')}",
f"{self.type.name}: {str(self.path_str).replace(os.sep, '/')}"
+ (f" -> {self.path.readlink().name}" if self.type == FileSystemNodeType.SYMLINK else ""),
SEPARATOR,
f"{self.content}",
]
@ -116,6 +118,9 @@ class FileSystemNode: # pylint: disable=too-many-instance-attributes
if self.type == FileSystemNodeType.DIRECTORY:
raise ValueError("Cannot read content of a directory node")
if self.type == FileSystemNodeType.SYMLINK:
return ""
if not is_text_file(self.path):
return "[Non-text file]"