# talemate/tests/test_utils_data.py
# 2025-04-08 12:27:21 +03:00
#
# 259 lines
# No EOL
# 8.1 KiB
# Python

import os
import pytest
import json
import yaml
from talemate.util.data import (
fix_faulty_json,
extract_json,
extract_json_v2,
extract_yaml_v2,
JSONEncoder,
DataParsingError
)
# Helper: resolve a fixture file name to its location on disk
def get_test_data_path(filename):
    """Return the path to ``filename`` inside this module's fixture directory.

    Fixtures are looked up in ``data/util/data`` relative to the directory
    containing this test file.
    """
    this_file = os.path.abspath(__file__)
    fixture_dir = os.path.join(os.path.dirname(this_file), 'data', 'util', 'data')
    return os.path.join(fixture_dir, filename)
def test_json_encoder():
    """JSONEncoder serializes objects json does not know as their string form."""

    class CustomObject:
        # Plain class that json cannot serialize by itself; it only
        # provides a string representation.
        def __str__(self):
            return "CustomObject"

    payload = {"obj": CustomObject()}
    serialized = json.dumps(payload, cls=JSONEncoder)

    # The custom instance must show up as the text returned by __str__
    assert serialized == '{"obj": "CustomObject"}'
def test_fix_faulty_json():
    """fix_faulty_json repairs adjacent objects and trailing commas."""
    # Adjacent objects: a comma is inserted between them. The repaired text
    # is still not a single JSON document, so the test wraps it in list
    # brackets before parsing.
    repaired = fix_faulty_json('{"a": 1}{"b": 2}')
    assert repaired == '{"a": 1},{"b": 2}'
    assert json.loads('[' + repaired + ']') == [{"a": 1}, {"b": 2}]

    # Trailing commas, inside an object and inside an array
    trailing_comma_cases = (
        ('{"a": 1, "b": 2,}', {"a": 1, "b": 2}),
        ('{"a": [1, 2, 3,]}', {"a": [1, 2, 3]}),
    )
    for faulty, expected in trailing_comma_cases:
        assert json.loads(fix_faulty_json(faulty)) == expected
def test_extract_json():
    """extract_json returns the leading JSON value as (raw text, parsed value)."""
    # Each case: (input text, expected raw JSON string, expected parsed value)
    leading_json_cases = (
        # Simple object followed by trailing text
        (
            '{"name": "test", "value": 42} and some text',
            '{"name": "test", "value": 42}',
            {"name": "test", "value": 42},
        ),
        # Array followed by trailing text
        (
            '[1, 2, 3] and some text',
            '[1, 2, 3]',
            [1, 2, 3],
        ),
        # Leading whitespace before the object
        (
            ' {"name": "test"} and some text',
            '{"name": "test"}',
            {"name": "test"},
        ),
    )
    for text, expected_raw, expected_obj in leading_json_cases:
        raw, parsed = extract_json(text)
        assert raw == expected_raw
        assert parsed == expected_obj

    # Input that does not start with JSON is rejected
    with pytest.raises(ValueError):
        extract_json('This is not JSON')
def test_extract_json_v2_valid():
    """Test extract_json_v2 with valid JSON in code blocks.

    Reads the ``valid_json.txt`` fixture and checks that exactly two unique
    objects are returned (the fixture's third object is a duplicate).
    """
    # Explicit encoding keeps the fixture read independent of the platform's
    # default locale encoding.
    with open(get_test_data_path('valid_json.txt'), encoding='utf-8') as f:
        text = f.read()

    result = extract_json_v2(text)

    # Two unique JSON objects are expected (third is a duplicate)
    assert len(result) == 2

    expected_first = {
        "name": "Test Object",
        "properties": {
            "id": 1,
            "active": True,
        },
        "tags": ["test", "json", "parsing"],
    }
    expected_second = {
        "name": "Simple Object",
        "value": 42,
    }

    # Membership checks: the test does not depend on result ordering
    assert expected_first in result
    assert expected_second in result
def test_extract_json_v2_invalid():
    """Test extract_json_v2 raises DataParsingError for invalid JSON.

    Reads the ``invalid_json.txt`` fixture and expects extraction to fail.
    """
    # Explicit encoding keeps the fixture read independent of the platform's
    # default locale encoding.
    with open(get_test_data_path('invalid_json.txt'), encoding='utf-8') as f:
        text = f.read()

    # Extraction must fail with DataParsingError
    with pytest.raises(DataParsingError):
        extract_json_v2(text)
def test_extract_json_v2_faulty():
    """Test extract_json_v2 with faulty but fixable JSON.

    Reads the ``faulty_json.txt`` fixture. Two outcomes are accepted: the
    extraction succeeds and yields at least one object, or it raises
    DataParsingError because some of the JSON is too broken to fix.
    """
    # Explicit encoding keeps the fixture read independent of the platform's
    # default locale encoding.
    with open(get_test_data_path('faulty_json.txt'), encoding='utf-8') as f:
        text = f.read()

    # Only the extraction call sits inside the try, so the except clause
    # cannot catch anything other than a failure of extract_json_v2 itself.
    try:
        result = extract_json_v2(text)
    except DataParsingError:
        # Deliberately tolerated: some of the fixture's JSON may be too
        # malformed to repair.
        pass
    else:
        # If everything could be fixed, at least one object must come back
        assert len(result) > 0
def test_data_parsing_error():
    """DataParsingError keeps the message and the data it was built with."""
    message = "Test error message"
    broken_payload = '{"broken": "json"'  # intentionally missing the closing brace

    err = DataParsingError(message, broken_payload)

    # Both constructor arguments are exposed as attributes, and the string
    # form of the exception is the message alone.
    assert err.message == message
    assert err.data == broken_payload
    assert str(err) == message
def test_extract_json_v2_multiple():
    """Test extract_json_v2 with multiple JSON objects including duplicates.

    Reads the ``multiple_json.txt`` fixture and checks that the three unique
    objects are returned, each exactly once (the fixture holds 5 objects in
    total, 3 of them unique).
    """
    from collections import Counter

    # Explicit encoding keeps the fixture read independent of the platform's
    # default locale encoding.
    with open(get_test_data_path('multiple_json.txt'), encoding='utf-8') as f:
        text = f.read()

    result = extract_json_v2(text)

    # 3 unique objects out of 5 total
    assert len(result) == 3

    expected_objects = [
        {
            "id": 1,
            "name": "First Object",
            "tags": ["one", "first", "primary"],
        },
        {
            "id": 2,
            "name": "Second Object",
            "tags": ["two", "second"],
        },
        {
            "id": 3,
            "name": "Third Object",
            "metadata": {
                "created": "2023-01-01",
                "version": 1.0,
            },
            "active": True,
        },
    ]

    # Every expected object must be present; ordering is not checked
    for expected in expected_objects:
        assert expected in result

    # Each ID should appear exactly once (no duplicates)
    id_counts = Counter(obj["id"] for obj in result)
    for id_val, count in id_counts.items():
        assert count == 1, f"Object with ID {id_val} appears {count} times (should be 1)"
def test_extract_yaml_v2_valid():
    """Test extract_yaml_v2 with valid YAML in code blocks.

    Reads the ``valid_yaml.txt`` fixture and checks that exactly two unique
    objects are returned (the fixture's third object is a duplicate).
    """
    # Explicit encoding keeps the fixture read independent of the platform's
    # default locale encoding.
    with open(get_test_data_path('valid_yaml.txt'), encoding='utf-8') as f:
        text = f.read()

    result = extract_yaml_v2(text)

    # Two unique YAML objects are expected (third is a duplicate)
    assert len(result) == 2

    expected_first = {
        "name": "Test Object",
        "properties": {
            "id": 1,
            "active": True,
        },
        "tags": ["test", "yaml", "parsing"],
    }
    expected_second = {
        "simple_name": "Simple Object",
        "value": 42,
    }

    # Membership checks: the test does not depend on result ordering
    assert expected_first in result
    assert expected_second in result
def test_extract_yaml_v2_invalid():
    """Test extract_yaml_v2 raises DataParsingError for invalid YAML.

    Reads the ``invalid_yaml.txt`` fixture and expects extraction to fail.
    """
    # Explicit encoding keeps the fixture read independent of the platform's
    # default locale encoding.
    with open(get_test_data_path('invalid_yaml.txt'), encoding='utf-8') as f:
        text = f.read()

    # Extraction must fail with DataParsingError
    with pytest.raises(DataParsingError):
        extract_yaml_v2(text)
def test_extract_yaml_v2_multiple():
    """Test extract_yaml_v2 with multiple YAML objects including duplicates.

    Reads the ``multiple_yaml.txt`` fixture and checks that the three unique
    objects are returned, each exactly once (the fixture holds 5 objects in
    total, 3 of them unique).
    """
    from collections import Counter

    # Explicit encoding keeps the fixture read independent of the platform's
    # default locale encoding.
    with open(get_test_data_path('multiple_yaml.txt'), encoding='utf-8') as f:
        text = f.read()

    result = extract_yaml_v2(text)

    # 3 unique objects out of 5 total
    assert len(result) == 3

    # Index the objects by ID for easier assertions
    objects_by_id = {obj["id"]: obj for obj in result}

    # Object 1
    assert objects_by_id[1]["name"] == "First Object"
    assert objects_by_id[1]["tags"] == ["one", "first", "primary"]

    # Object 2
    assert objects_by_id[2]["name"] == "Second Object"
    assert objects_by_id[2]["tags"] == ["two", "second"]

    # Object 3 - the date is parsed as a date object by YAML, so only the
    # presence of the "created" key is checked, not its value
    assert objects_by_id[3]["name"] == "Third Object"
    assert objects_by_id[3]["active"] is True
    assert "created" in objects_by_id[3]["metadata"]

    # Each ID should appear exactly once (no duplicates)
    id_counts = Counter(obj["id"] for obj in result)
    for id_val, count in id_counts.items():
        assert count == 1, f"Object with ID {id_val} appears {count} times (should be 1)"