import json
import argparse
import sys


def tokenize_json_keys_recursive(data, key_to_token_map, token_to_key_map, token_counter):
    """
    Recursively traverses the data, tokenizing keys and building the key maps.
    """
    if isinstance(data, dict):
        new_dict = {}
        for key, value in data.items():
            if key not in key_to_token_map:
                # Generate a new token for a new key
                token = f"k{token_counter[0]}"
                key_to_token_map[key] = token
                token_to_key_map[token] = key
                token_counter[0] += 1
            new_key = key_to_token_map[key]
            new_dict[new_key] = tokenize_json_keys_recursive(
                value, key_to_token_map, token_to_key_map, token_counter
            )
        return new_dict
    elif isinstance(data, list):
        return [
            tokenize_json_keys_recursive(item, key_to_token_map, token_to_key_map, token_counter)
            for item in data
        ]
    else:
        return data


def tokenize_from_object(json_object):
    """
    Takes a Python dictionary or list and returns the tokenized data and the key mapper.
    This is the primary function for programmatic use (Mode 1).

    Args:
        json_object: The Python object (dict or list) to process.

    Returns:
        A tuple containing (tokenized_data, key_mapper).
    """
    key_to_token = {}
    token_to_key = {}
    # Use a list as a mutable integer for pass-by-reference behavior
    counter = [0]
    tokenized_data = tokenize_json_keys_recursive(json_object, key_to_token, token_to_key, counter)
    return tokenized_data, token_to_key


def main():
    """
    Main function to handle file-based tokenization (Mode 2).
    """
    parser = argparse.ArgumentParser(
        description="Reduces JSON file size by tokenizing keys."
    )
    parser.add_argument(
        "--input-file",
        type=str,
        required=True,
        help="The absolute path to the input JSON file."
    )
    parser.add_argument(
        "--output-file",
        type=str,
        required=True,
        help="The absolute path to the output JSON file."
    )
    args = parser.parse_args()

    # --- 1. Read the input JSON file ---
    try:
        with open(args.input_file, 'r', encoding='utf-8') as f:
            original_data = json.load(f)
    except FileNotFoundError:
        print(f"Error: Input file not found at {args.input_file}", file=sys.stderr)
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error: Could not parse input JSON file. {e}", file=sys.stderr)
        sys.exit(1)

    # --- 2. Tokenize the data ---
    tokenized_data, key_mapper = tokenize_from_object(original_data)

    # --- 3. Structure the final output ---
    final_output = {
        "key_mapper": key_mapper,
        "data": tokenized_data
    }

    # --- 4. Write to the output file ---
    try:
        with open(args.output_file, 'w', encoding='utf-8') as f:
            json.dump(final_output, f, separators=(',', ':'))
        print(f"Successfully created tokenized JSON at: {args.output_file}")
    except IOError as e:
        print(f"Error: Could not write to output file {args.output_file}. {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    # Example of how to run from the command line:
    # python json_reducer.py --input-file /path/to/input.json --output-file /path/to/output.json
    main()
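

# --- Usage sketch for the programmatic API (Mode 1) ---
# A minimal illustration only; the sample data below is hypothetical, and the
# import assumes this file is saved as json_reducer.py as in the CLI example above.
#
#   from json_reducer import tokenize_from_object
#
#   record = {"users": [{"name": "Ada", "role": "admin"},
#                       {"name": "Bob", "role": "user"}]}
#   tokenized, mapper = tokenize_from_object(record)
#
#   # tokenized -> {"k0": [{"k1": "Ada", "k2": "admin"},
#   #                      {"k1": "Bob", "k2": "user"}]}
#   # mapper    -> {"k0": "users", "k1": "name", "k2": "role"}
#
# Because repeated keys ("name", "role") map to the same short token, the savings
# grow with the number of objects sharing a schema. Restoring the original keys is
# the mirror of the tokenizer: walk the structure and replace each token with
# mapper[token].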