json_utils.py

import json
import sys
from pathlib import Path

# Ensure the reducer is importable
try:
    from json_reducer import tokenize_from_object
except ImportError:
    print("Warning: json_reducer could not be imported. The 'shrink' feature will be disabled.", file=sys.stderr)

    # Define a dummy function if the import fails
    def tokenize_from_object(data):
        return data, {}


def _remove_empty_keys(data):
    """
    Recursively removes keys from dictionaries that have None, empty list,
    or empty dict values.
    """
    if isinstance(data, dict):
        # Recursively clean the values first, then rebuild the dictionary
        # without empty values. Cleaning before filtering ensures that
        # containers which only *become* empty after cleaning are dropped too.
        cleaned = {k: _remove_empty_keys(v) for k, v in data.items()}
        return {k: v for k, v in cleaned.items()
                if v is not None and v != [] and v != {}}
    elif isinstance(data, list):
        # Recursively process each item in the list.
        return [_remove_empty_keys(item) for item in data]
    else:
        # Return all other data types (strings, numbers, etc.) as is.
        return data
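
# Illustration (a hedged sketch, not executed at import time): keys whose
# values are empty, or become empty once their children are cleaned, are
# dropped, while scalars and non-empty values survive:
#     _remove_empty_keys({"a": 1, "b": None, "c": {"d": []}})  ->  {"a": 1}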


def write_json(data, file_path, indent=None, shrink=False, ensure_ascii=False):
    """
    Centralized function to write Python objects to a JSON file.

    Args:
        data: The Python object (e.g., dict, list) to serialize.
        file_path: The path to the output file.
        indent: The indentation level for pretty-printing. Defaults to None (compact).
        shrink: If True, tokenizes JSON keys and removes empty values to reduce size.
        ensure_ascii: Whether to escape non-ASCII characters. Defaults to False.
    """
    final_data = data
    output_indent = indent
    if shrink:
        # First, remove any keys with empty values from the original data.
        cleaned_data = _remove_empty_keys(data)
        # Then, tokenize the keys of the cleaned data.
        tokenized_data, key_mapper = tokenize_from_object(cleaned_data)
        final_data = {
            "key_mapper": key_mapper,
            "data": tokenized_data,
        }
        # When shrinking, always use compact formatting for maximum size reduction.
        output_indent = None

    # An indent of 0 is treated as compact (None for json.dump).
    if output_indent == 0:
        output_indent = None

    try:
        with open(file_path, 'w', encoding='utf-8') as f:
            if output_indent is None:
                json.dump(final_data, f, ensure_ascii=ensure_ascii, separators=(',', ':'))
            else:
                json.dump(final_data, f, indent=output_indent, ensure_ascii=ensure_ascii)
    except IOError as e:
        print(f"Error writing JSON to {file_path}: {e}", file=sys.stderr)
        raise
    except TypeError as e:
        print(f"Error serializing data to JSON: {e}", file=sys.stderr)
        raise
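

# A minimal usage sketch (assumptions: the module is run as a script, and the
# output paths "example.json" / "example.min.json" are hypothetical names
# used only for this demo).
if __name__ == "__main__":
    sample = {
        "name": "widget",
        "tags": [],              # dropped by the shrink cleaning pass
        "meta": {"note": None},  # becomes empty after cleaning, then dropped
        "count": 3,
    }
    # Pretty-printed output.
    write_json(sample, "example.json", indent=2)
    # Compact, shrunk output, wrapped as {"key_mapper": ..., "data": ...}.
    # With the real json_reducer the keys are tokenized; with the fallback
    # stub above, the data passes through unchanged and key_mapper is {}.
    write_json(sample, "example.min.json", shrink=True)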