import json
import argparse
import sys


def tokenize_json_keys_recursive(data, key_to_token_map, token_to_key_map, token_counter):
    """
    Recursively traverses the data, tokenizing keys and building the key maps.
    """
    if isinstance(data, dict):
        new_dict = {}
        for key, value in data.items():
            if key not in key_to_token_map:
                # Generate a new token for a previously unseen key
                token = f"k{token_counter[0]}"
                key_to_token_map[key] = token
                token_to_key_map[token] = key
                token_counter[0] += 1

            new_key = key_to_token_map[key]
            new_dict[new_key] = tokenize_json_keys_recursive(
                value, key_to_token_map, token_to_key_map, token_counter
            )
        return new_dict
    elif isinstance(data, list):
        return [
            tokenize_json_keys_recursive(item, key_to_token_map, token_to_key_map, token_counter)
            for item in data
        ]
    else:
        return data


def tokenize_from_object(json_object):
    """
    Takes a Python dictionary or list and returns the tokenized data plus a
    mapper that translates tokens back to the original key names.
    This is the primary function for programmatic use (Mode 1).

    Args:
        json_object: The Python object (dict or list) to process.

    Returns:
        A tuple of (tokenized_data, key_mapper), where key_mapper maps each
        token (e.g. "k0") back to its original key.
    """
    key_to_token = {}
    token_to_key = {}
    # Use a single-element list as a mutable counter so the recursion can
    # increment it in place (pass-by-reference behavior)
    counter = [0]

    tokenized_data = tokenize_json_keys_recursive(json_object, key_to_token, token_to_key, counter)

    return tokenized_data, token_to_key
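
# Illustrative example of programmatic use (Mode 1). The sample payload below
# is hypothetical; tokens are assigned in key-discovery order, so a repeated
# key (like "name") shares a single token:
#
#     sample = {"user": {"name": "Ada", "projects": [{"name": "engine"}]}}
#     data, mapper = tokenize_from_object(sample)
#     # data   -> {"k0": {"k1": "Ada", "k2": [{"k1": "engine"}]}}
#     # mapper -> {"k0": "user", "k1": "name", "k2": "projects"}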


def main():
    """
    Main function to handle file-based tokenization (Mode 2).
    """
    parser = argparse.ArgumentParser(
        description="Reduces JSON file size by tokenizing keys."
    )
    parser.add_argument(
        "--input-file",
        type=str,
        required=True,
        help="The absolute path to the input JSON file."
    )
    parser.add_argument(
        "--output-file",
        type=str,
        required=True,
        help="The absolute path to the output JSON file."
    )
    args = parser.parse_args()

    # --- 1. Read the input JSON file ---
    try:
        with open(args.input_file, 'r', encoding='utf-8') as f:
            original_data = json.load(f)
    except FileNotFoundError:
        print(f"Error: Input file not found at {args.input_file}", file=sys.stderr)
        sys.exit(1)
    except json.JSONDecodeError as e:
        print(f"Error: Could not parse input JSON file. {e}", file=sys.stderr)
        sys.exit(1)

    # --- 2. Tokenize the data ---
    tokenized_data, key_mapper = tokenize_from_object(original_data)

    # --- 3. Structure the final output ---
    final_output = {
        "key_mapper": key_mapper,
        "data": tokenized_data
    }

    # --- 4. Write to the output file ---
    try:
        with open(args.output_file, 'w', encoding='utf-8') as f:
            json.dump(final_output, f, separators=(',', ':'))
        print(f"Successfully created tokenized JSON at: {args.output_file}")
    except IOError as e:
        print(f"Error: Could not write to output file {args.output_file}. {e}", file=sys.stderr)
        sys.exit(1)


if __name__ == '__main__':
    # Example of how to run from the command line:
    # python json_reducer.py --input-file /path/to/input.json --output-file /path/to/output.json
    main()
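
# Illustrative shape of the output file for a small hypothetical input
# (token assignment follows key-discovery order):
#
#     input:  {"user": {"name": "Ada"}, "active": true}
#     output: {"key_mapper": {"k0": "user", "k1": "name", "k2": "active"},
#              "data": {"k0": {"k1": "Ada"}, "k2": true}}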