Skip to content

Custom Anonymization Operators

While Allyanonimiser provides several built-in anonymization operators, you can also create custom operators to meet specific requirements. Custom operators give you complete control over how detected entities are transformed.

Creating a Custom Operator

A custom operator is a function that takes two parameters:

1. entity_text - The text of the detected entity
2. entity_type - The type of the detected entity (e.g., "PERSON", "EMAIL_ADDRESS")

The function should return the transformed text that will replace the original entity.

Basic Example

from allyanonimiser import create_allyanonimiser

# Define a simple custom operator
def reverse_text(entity_text, entity_type):
    """Return the entity's characters in reverse order.

    ``entity_type`` is part of the operator signature but is not consulted.
    """
    return "".join(reversed(entity_text))

# Build the anonymizer that will run the custom operator
ally = create_allyanonimiser()

# Route PERSON entities through reverse_text: the dictionary value is the
# callable itself, not a string name
person_operators = {"PERSON": reverse_text}
result = ally.anonymize(
    text="Customer John Smith sent an email about their policy.",
    operators=person_operators,
)

print(result["text"])
# Output: "Customer htimS nhoJ sent an email about their policy."

Advanced Custom Operator Examples

Randomized Name Replacement

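This example replaces detected person names with names picked from a predefined list. The pick is driven by a hash of the original name rather than by a random draw, so the same input name always maps to the same replacement (because the list is short, two different names can map to the same replacement):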

from allyanonimiser import create_allyanonimiser
import hashlib

def randomize_names(entity_text, entity_type):
    """Swap a person name for a stand-in chosen from a fixed list.

    Entities whose type is not "PERSON" are returned untouched. The stand-in
    is selected from the MD5 digest of the original text, so equal inputs
    always produce the same replacement.
    """
    if entity_type == "PERSON":
        stand_ins = (
            "Alex Taylor", "Sam Johnson", "Jordan Lee", "Casey Brown",
            "Morgan Smith", "Jamie Williams", "Riley Garcia", "Taylor Wilson",
        )

        # Read the 16-byte digest as one big-endian integer and fold it onto
        # the list length to obtain a stable index.
        digest = hashlib.md5(entity_text.encode()).digest()
        return stand_ins[int.from_bytes(digest, "big") % len(stand_ins)]

    return entity_text

# Instantiate the anonymizer
ally = create_allyanonimiser()

# Anonymize a sentence, sending every PERSON entity through randomize_names
sample_text = "Customer John Smith contacted Mary Johnson about their policy."
result = ally.anonymize(text=sample_text, operators={"PERSON": randomize_names})

print(result["text"])
# Output has this shape (which stand-in each name gets depends on its MD5 hash):
# "Customer Alex Taylor contacted Sam Johnson about their policy."

Format-Preserving Encryption

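This operator preserves the format of credit card numbers while anonymizing them. Note that, despite the section title, the replacement digits are derived from a salted MD5 hash of the original number, so the transformation is a one-way substitution and cannot be decrypted back to the original: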

from allyanonimiser import create_allyanonimiser
import hashlib

def format_preserving_cc(entity_text, entity_type):
    """
    Substitute the digits of a credit card number while keeping its layout.

    Entities whose type is not "CREDIT_CARD" are returned unchanged. For card
    entities the first six digits are kept, the following digits are derived
    from an MD5 hash of the original digits plus the literal "salt", and the
    final digit is whatever generate_luhn_digit returns for the digits before
    it. Every non-digit character (spaces, dashes, ...) stays where it was.
    """
    if entity_type != "CREDIT_CARD":
        return entity_text

    # Digits only, separators stripped
    card_digits = ''.join(filter(str.isdigit, entity_text))

    # The leading six digits (issuer identification) are carried over as-is
    issuer_prefix = card_digits[:6]

    # Turn each hex character of the digest into one decimal digit
    hex_digest = hashlib.md5((card_digits + "salt").encode()).hexdigest()
    pseudo_digits = ''.join(str(int(nibble, 16) % 10) for nibble in hex_digest)

    # Leave room for the six-digit prefix and the trailing check digit
    filler_len = len(card_digits) - 6 - 1
    payload = issuer_prefix + pseudo_digits[:filler_len]
    replacement = payload + str(generate_luhn_digit(payload))

    # Walk the original text and drop replacement digits into the digit slots;
    # once the replacement runs out, the original character is kept.
    pending = iter(replacement)
    return ''.join(
        next(pending, char) if char.isdigit() else char
        for char in entity_text
    )

# Helper function for Luhn algorithm
def generate_luhn_digit(partial_number):
    """Compute the Luhn check digit to append to a number missing its last digit."""
    values = list(map(int, partial_number))

    total = 0
    for offset, value in enumerate(reversed(values)):
        # Counting from the right, positions 0, 2, 4, ... are doubled; a
        # two-digit product is reduced by 9 (same as summing its digits).
        if offset % 2 == 0:
            value *= 2
            if value > 9:
                value -= 9
        total += value

    # Digit that brings the total up to the next multiple of ten
    return (10 - (total % 10)) % 10

# Set up the anonymizer
ally = create_allyanonimiser()

# Hand CREDIT_CARD entities to the format-preserving operator
card_operators = {"CREDIT_CARD": format_preserving_cc}
result = ally.anonymize(
    text="Payment made with card 4111-2222-3333-4444 expiring 12/25",
    operators=card_operators,
)

print(result["text"])
# Output keeps the dashes and the first six digits; each # below stands for a
# generated digit (the last one is the Luhn check digit):
# "Payment made with card 4111-22##-####-#### expiring 12/25"

Differential Privacy Operator

This example adds noise to numerical values to implement differential privacy:

from allyanonimiser import create_allyanonimiser
import re
import random

def differential_privacy_number(entity_text, entity_type):
    """
    Perturb the first number in a monetary entity with zero-mean Laplace noise.

    Entities whose type is not "MONETARY_VALUE", and entities that contain no
    number, are returned unchanged. Otherwise the first number found is
    replaced by ``value + noise``, where the Laplace scale is
    ``max(1.0, 5% of |value|) / epsilon`` with ``epsilon = 0.1``. The result
    keeps the original number of decimal places, is written with comma
    thousands separators, and every character outside the matched number
    (currency symbol, trailing punctuation, other numbers) is left untouched.

    Returns:
        The entity text with its first number replaced by the noisy value.
    """
    if entity_type != "MONETARY_VALUE":
        return entity_text

    # Match one well-formed number: digits with optional comma groups and an
    # optional fraction, or a bare fraction such as ".50". Punctuation that
    # merely follows the amount ("$1,234.56.") is deliberately not captured,
    # otherwise the float conversion fails and the amount is left in the clear.
    match = re.search(r'\d+(?:,\d+)*(?:\.\d+)?|\.\d+', entity_text)
    if not match:
        return entity_text

    # Convert to float
    value_str = match.group(0).replace(',', '')
    try:
        value = float(value_str)
    except ValueError:
        # Defensive: the pattern above should only yield parseable numbers
        return entity_text

    # Laplace noise with scale proportional to the value
    epsilon = 0.1  # Privacy parameter (smaller = more privacy)
    scale = max(1.0, abs(value) * 0.05) / epsilon
    # The difference of two independent unit-rate exponential draws follows a
    # Laplace(0, 1) distribution; multiplying by `scale` gives Laplace(0, scale).
    noise = scale * (random.expovariate(1.0) - random.expovariate(1.0))

    # Keep as many decimal places as the original number had
    decimals = len(value_str.split('.')[-1]) if '.' in value_str else 0
    # Adding 0.0 turns a rounded negative zero into 0.0 so "-0.00" is never shown
    new_value = round(value + noise, decimals) + 0.0

    # Format with commas for thousands
    new_value_str = f"{new_value:,.{decimals}f}"

    # Splice the new number into the matched span only; str.replace would also
    # rewrite any other occurrence of the same digits inside the entity.
    return entity_text[:match.start()] + new_value_str + entity_text[match.end():]

# Prepare the anonymizer
ally = create_allyanonimiser()

# Perturb MONETARY_VALUE entities with the noise-adding operator
premium_sentence = "The policy premium is $1,234.56 per year."
result = ally.anonymize(
    text=premium_sentence,
    operators={"MONETARY_VALUE": differential_privacy_number},
)

print(result["text"])
# Output might be: "The policy premium is $1,246.32 per year."

Stateful Custom Operators

You can also create stateful operators by using closures or classes. This is useful when you need to maintain state across multiple anonymizations.

Class-Based Operator

from allyanonimiser import create_allyanonimiser

class EntityCounter:
    """Callable operator that numbers each entity within its entity type."""

    def __init__(self):
        # Maps entity type -> how many entities of that type have been seen
        self.counts = {}

    def __call__(self, entity_text, entity_type):
        """Record one more entity of this type and return its numbered tag."""
        seen_so_far = self.counts.get(entity_type, 0) + 1
        self.counts[entity_type] = seen_so_far

        # The tag carries the type and its running number, e.g. <PERSON_2>
        return "<{}_{}>".format(entity_type, seen_so_far)

    def get_stats(self):
        """Return the per-type counts dictionary held by this instance."""
        return self.counts

# Build the anonymizer
ally = create_allyanonimiser()

# One EntityCounter instance is shared by both entity types so that all
# counts accumulate in the same object
counter_op = EntityCounter()

# Anonymize with the shared counter registered for PERSON and LOCATION
result = ally.anonymize(
    text="John Smith and Jane Doe both live in Sydney.",
    operators=dict.fromkeys(("PERSON", "LOCATION"), counter_op),
)

print(result["text"])
# Output: "<PERSON_1> and <PERSON_2> both live in <LOCATION_1>."

# The counter still holds the totals after the call
print(counter_op.get_stats())
# Output: {'PERSON': 2, 'LOCATION': 1}

Integration with AnonymizationConfig

You can also use custom operators with the AnonymizationConfig object:

from allyanonimiser import create_allyanonimiser, AnonymizationConfig

# Define a custom operator
def uppercase_operator(entity_text, entity_type):
    """Return the entity text converted to upper case; the type is not used."""
    shouted = entity_text.upper()
    return shouted

# Mix a custom callable with built-in operators referenced by name
operator_map = {
    "PERSON": uppercase_operator,
    "EMAIL_ADDRESS": "mask",
    "POLICY_NUMBER": "replace",
}
config = AnonymizationConfig(operators=operator_map)

# Build the anonymizer
ally = create_allyanonimiser()

# Pass the configuration object instead of an operators dictionary
result = ally.anonymize(
    text="John Smith (john.smith@example.com) has policy POL-123456.",
    config=config,
)

print(result["text"])
# Output: "JOHN SMITH (j***.s****@e******.com) has policy <POLICY_NUMBER>."

See Also