Files
romm/backend/tools/xml_diagnostics.py
Georges-Antoine Assi c87d880042 Standardize backend/tools file headers
Give all three tools the same header shape: shebang, summary docstring, a
short detail paragraph, and a "Run from the backend directory:" command
block. Adds docstrings to generate_supported_platforms.py and
xml_diagnostics.py, which previously had none.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-23 22:05:52 -04:00

97 lines
3.1 KiB
Python

#!/usr/bin/env python3
"""Diagnose a malformed XML file: bad encoding, invalid chars, parse errors.
Reads the file in chunks to surface UTF-8 issues, then runs SAX and
ElementTree parsing to report the location of structural errors, with a few
lines of context around each one.
Run from the backend directory:
uv run tools/xml_diagnostics.py <xml_file>
"""
import sys
from xml.sax.handler import ContentHandler # nosec
from defusedxml import ElementTree as ET
from defusedxml.sax import make_parser
class DiagnosticHandler(ContentHandler):
    """SAX content handler that reports characters not allowed in XML 1.0.

    Every chunk of character data delivered by the parser is scanned, and
    each offending character is printed together with the parser's current
    line and column (when a document locator has been supplied).
    """

    def __init__(self):
        super().__init__()
        self.line_number = 0
        self.column_number = 0
        # Filled in by the parser through setDocumentLocator(); stays None
        # when the handler is driven without one.
        self.locator = None

    def setDocumentLocator(self, locator):
        """Remember the parser's locator so findings can be positioned."""
        self.locator = locator

    @staticmethod
    def _is_invalid_xml_char(char):
        """Return True if *char* is outside the XML 1.0 ``Char`` production.

        Allowed: tab, LF, CR, U+0020-U+D7FF, U+E000-U+FFFD and
        U+10000-U+10FFFF. Supplementary-plane characters (e.g. emoji) are
        therefore valid and must not be reported.
        """
        code = ord(char)
        if code < 0x20:
            return char not in "\n\r\t"
        if 0xD800 <= code <= 0xDFFF:
            # Lone surrogates are never legal XML characters.
            return True
        return code in (0xFFFE, 0xFFFF)

    def characters(self, content):
        """Print a line for every invalid XML character found in *content*."""
        for char in content:
            if not self._is_invalid_xml_char(char):
                continue
            if self.locator is None:
                print(
                    f"Found invalid character '0x{ord(char):04x}' at an unknown position"
                )
                continue
            print(
                f"Found invalid character '0x{ord(char):04x}' at line {self.locator.getLineNumber()}, column {self.locator.getColumnNumber()}"
            )
def _iter_utf8_errors(stream, chunk_size=8192):
    """Yield ``(byte_offset, bad_bytes)`` for each invalid UTF-8 sequence.

    *stream* is a binary file object read *chunk_size* bytes at a time.
    A sequence that is cut off by the end of a chunk is carried over and
    re-examined together with the next chunk, so multi-byte characters that
    straddle a chunk boundary are not reported as errors.
    """
    pending = b""
    offset = 0  # absolute file offset of data[0] for the current pass
    while True:
        chunk = stream.read(chunk_size)
        data = pending + chunk
        pending = b""
        pos = 0
        while pos < len(data):
            try:
                data[pos:].decode("utf-8")
            except UnicodeDecodeError as e:
                start = pos + e.start
                end = pos + e.end
                if chunk and end == len(data):
                    # The failure touches the end of the buffer and more
                    # input may follow: defer judgement to the next pass.
                    pending = data[start:]
                    break
                yield offset + start, data[start:end]
                pos = end
            else:
                break
        offset += len(data) - len(pending)
        if not chunk:
            return


def _print_error_context(filename, line_num, radius=2):
    """Print up to *radius* lines on each side of 1-based line *line_num*.

    Undecodable bytes are replaced rather than raising, because the file
    being inspected is expected to be damaged.
    """
    first = max(1, line_num - radius)
    last = line_num + radius
    try:
        with open(filename, encoding="utf-8", errors="replace") as f:
            print("\nContext around error:")
            for number, text in enumerate(f, start=1):
                if number > last:
                    break
                if number >= first:
                    prefix = "-> " if number == line_num else " "
                    print(f"{prefix}{number}: {text.rstrip()}")
    except OSError as context_error:
        print(f"Could not get context: {context_error}")


def diagnose_xml(filename):
    """Print a diagnosis of encoding and well-formedness problems in an XML file.

    Three passes are made over *filename*:

    1. a raw byte scan that reports every invalid UTF-8 sequence with its
       byte offset and the offending bytes in hex;
    2. a SAX parse using ``DiagnosticHandler`` to report invalid characters
       and the first SAX error;
    3. an ElementTree parse whose error, if any, is printed together with
       the surrounding source lines.

    All findings go to stdout. Returns ``None``; if the file cannot be read
    the function reports that and stops after the first pass.
    """
    print(f"Analyzing {filename}...")

    # Pass 1: raw UTF-8 validation.
    try:
        with open(filename, "rb") as f:
            for byte_pos, bad_bytes in _iter_utf8_errors(f):
                print(f"Found invalid UTF-8 sequence at byte position {byte_pos}")
                # Show the bytes as hex: they are by definition not decodable.
                print(f"Problematic bytes: 0x{bad_bytes.hex()}")
    except OSError as e:
        print(f"Error reading file: {e}")
        return None

    # Pass 2: SAX parsing for detailed error reporting.
    parser = make_parser()
    handler = DiagnosticHandler()
    parser.setContentHandler(handler)
    try:
        parser.parse(filename)
    except Exception as e:
        print(f"SAX parsing error: {e}")

    # Pass 3: ElementTree parsing, with source context for the error line.
    try:
        ET.parse(filename)
    except ET.ParseError as e:
        print(f"ElementTree parsing error: {e}")
        _print_error_context(filename, e.position[0])
    except ValueError as e:
        # defusedxml signals rejected constructs (DTDs, entities, ...) with
        # ValueError subclasses; report them instead of crashing, as the SAX
        # pass above already does.
        print(f"ElementTree parsing error: {e}")
if __name__ == "__main__":
    # The script takes exactly one positional argument: the XML file path.
    cli_args = sys.argv[1:]
    if len(cli_args) == 1:
        diagnose_xml(cli_args[0])
    else:
        print("Usage: uv run tools/xml_diagnostics.py <xml_file>")
        sys.exit(1)