# json_reducer.py

import json
import argparse
import sys


def tokenize_json_keys_recursive(data, key_to_token_map, token_to_key_map, token_counter):
    """
    Recursively traverses the data, tokenizing keys and building the key maps.
    """
    if isinstance(data, dict):
        new_dict = {}
        for key, value in data.items():
            if key not in key_to_token_map:
                # Generate a new token for a new key
                token = f"k{token_counter[0]}"
                key_to_token_map[key] = token
                token_to_key_map[token] = key
                token_counter[0] += 1
            new_key = key_to_token_map[key]
            new_dict[new_key] = tokenize_json_keys_recursive(value, key_to_token_map, token_to_key_map, token_counter)
        return new_dict
    elif isinstance(data, list):
        return [tokenize_json_keys_recursive(item, key_to_token_map, token_to_key_map, token_counter) for item in data]
    else:
        return data
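
# Illustrative behavior (hypothetical data, not from the original source):
# repeated keys reuse the same token at every depth, so the input
# {"name": "Ada", "tags": [{"name": "x"}]} tokenizes to
# {"k0": "Ada", "k1": [{"k0": "x"}]} with
# token_to_key_map == {"k0": "name", "k1": "tags"}.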


def tokenize_from_object(json_object):
    """
    Takes a Python dictionary or list and returns the tokenized data and the key mapper.
    This is the primary function for programmatic use (Mode 1).

    Args:
        json_object: The Python object (dict or list) to process.

    Returns:
        A tuple containing (tokenized_data, key_mapper).
    """
    key_to_token = {}
    token_to_key = {}
    # Use a list as a mutable integer for pass-by-reference behavior
    counter = [0]
    tokenized_data = tokenize_json_keys_recursive(json_object, key_to_token, token_to_key, counter)
    return tokenized_data, token_to_key
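

# A minimal inverse sketch (not part of the original script; the name
# detokenize_from_object is an illustrative assumption). It rebuilds the
# original keys from the (tokenized_data, key_mapper) pair returned above.
def detokenize_from_object(tokenized_data, key_mapper):
    """Reverses tokenize_from_object using the token-to-key mapper."""
    if isinstance(tokenized_data, dict):
        return {key_mapper.get(token, token): detokenize_from_object(value, key_mapper)
                for token, value in tokenized_data.items()}
    elif isinstance(tokenized_data, list):
        return [detokenize_from_object(item, key_mapper) for item in tokenized_data]
    return tokenized_data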


def main():
    """
    Main function to handle file-based tokenization (Mode 2).
    """
    parser = argparse.ArgumentParser(
        description="Reduces JSON file size by tokenizing keys."
    )
    parser.add_argument(
        "--input-file",
        type=str,
        required=True,
        help="The absolute path to the input JSON file."
    )
    parser.add_argument(
        "--output-file",
        type=str,
        required=True,
        help="The absolute path to the output JSON file."
    )
    args = parser.parse_args()

    # --- 1. Read the input JSON file ---
    try:
        with open(args.input_file, 'r', encoding='utf-8') as f:
            original_data = json.load(f)
    except FileNotFoundError:
        print(f"Error: Input file not found at {args.input_file}", file=sys.stderr)
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error: Could not parse input JSON file. {e}", file=sys.stderr)
        sys.exit(1)

    # --- 2. Tokenize the data ---
    tokenized_data, key_mapper = tokenize_from_object(original_data)

    # --- 3. Structure the final output ---
    final_output = {
        "key_mapper": key_mapper,
        "data": tokenized_data
    }
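
    # For a hypothetical input {"name": "Ada"}, the file written below would
    # contain {"key_mapper":{"k0":"name"},"data":{"k0":"Ada"}}; the compact
    # separators keep the mapper's overhead small.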

    # --- 4. Write to the output file ---
    try:
        with open(args.output_file, 'w', encoding='utf-8') as f:
            json.dump(final_output, f, separators=(',', ':'))
        print(f"Successfully created tokenized JSON at: {args.output_file}")
    except IOError as e:
        print(f"Error: Could not write to output file {args.output_file}. {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    # Example of how to run from the command line:
    # python json_reducer.py --input-file /path/to/input.json --output-file /path/to/output.json
    main()
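
# Round-trip sketch (illustrative; assumes the detokenize_from_object helper
# defined above and a hypothetical output path):
#
#     with open("/path/to/output.json", "r", encoding="utf-8") as f:
#         payload = json.load(f)
#     restored = detokenize_from_object(payload["data"], payload["key_mapper"])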