import json
import argparse
import sys


def tokenize_json_keys_recursive(data, key_to_token_map, token_to_key_map, token_counter):
    """
    Recursively traverses the data, tokenizing keys and building the key maps.
    """
    if isinstance(data, dict):
        new_dict = {}
        for key, value in data.items():
            if key not in key_to_token_map:
                # Generate a new token for a previously unseen key
                token = f"k{token_counter[0]}"
                key_to_token_map[key] = token
                token_to_key_map[token] = key
                token_counter[0] += 1

            new_key = key_to_token_map[key]
            new_dict[new_key] = tokenize_json_keys_recursive(
                value, key_to_token_map, token_to_key_map, token_counter
            )
        return new_dict
    elif isinstance(data, list):
        return [
            tokenize_json_keys_recursive(item, key_to_token_map, token_to_key_map, token_counter)
            for item in data
        ]
    else:
        return data


def tokenize_from_object(json_object):
    """
    Takes a Python dictionary or list and returns the tokenized data plus a
    mapper that translates tokens back to the original key names.
    This is the primary function for programmatic use (Mode 1).

    Args:
        json_object: The Python object (dict or list) to process.

    Returns:
        A tuple of (tokenized_data, key_mapper), where key_mapper maps each
        token (e.g. "k0") back to its original key.
    """
    key_to_token = {}
    token_to_key = {}
    # Use a single-element list as a mutable counter so the recursion can
    # increment it in place (pass-by-reference behavior)
    counter = [0]

    tokenized_data = tokenize_json_keys_recursive(json_object, key_to_token, token_to_key, counter)

    return tokenized_data, token_to_key
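
# Illustrative example of programmatic use (Mode 1). The sample payload below
# is hypothetical; tokens are assigned in key-discovery order, so a repeated
# key (like "name") shares a single token:
#
#     sample = {"user": {"name": "Ada", "projects": [{"name": "engine"}]}}
#     data, mapper = tokenize_from_object(sample)
#     # data   -> {"k0": {"k1": "Ada", "k2": [{"k1": "engine"}]}}
#     # mapper -> {"k0": "user", "k1": "name", "k2": "projects"}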


def main():
    """
    Main function to handle file-based tokenization (Mode 2).
    """
    parser = argparse.ArgumentParser(
        description="Reduces JSON file size by tokenizing keys."
    )
    parser.add_argument(
        "--input-file",
        type=str,
        required=True,
        help="The absolute path to the input JSON file."
    )
    parser.add_argument(
        "--output-file",
        type=str,
        required=True,
        help="The absolute path to the output JSON file."
    )
    args = parser.parse_args()

    # --- 1. Read the input JSON file ---
    try:
        with open(args.input_file, 'r', encoding='utf-8') as f:
            original_data = json.load(f)
    except FileNotFoundError:
        print(f"Error: Input file not found at {args.input_file}", file=sys.stderr)
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error: Could not parse input JSON file. {e}", file=sys.stderr)
        sys.exit(1)

    # --- 2. Tokenize the data ---
    tokenized_data, key_mapper = tokenize_from_object(original_data)

    # --- 3. Structure the final output ---
    final_output = {
        "key_mapper": key_mapper,
        "data": tokenized_data
    }

    # --- 4. Write to the output file ---
    try:
        with open(args.output_file, 'w', encoding='utf-8') as f:
            json.dump(final_output, f, separators=(',', ':'))
        print(f"Successfully created tokenized JSON at: {args.output_file}")
    except IOError as e:
        print(f"Error: Could not write to output file {args.output_file}. {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    # Example of how to run from the command line:
    # python json_reducer.py --input-file /path/to/input.json --output-file /path/to/output.json
    main()
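
# Illustrative shape of the output file for a small hypothetical input
# (token assignment follows key-discovery order):
#
#     input:  {"user": {"name": "Ada"}, "active": true}
#     output: {"key_mapper": {"k0": "user", "k1": "name", "k2": "active"},
#              "data": {"k0": {"k1": "Ada"}, "k2": true}}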