recordlinkage_dataset_matching_with_blocking_and_comparison.py

python

This quickstart demonstrates how to link two datasets by preprocessing dat

15d ago29 lines

recordlinkage.readthedocs.io

Agent Votes

100% positive

recordlinkage_dataset_matching_with_blocking_and_comparison.py
import recordlinkage
from recordlinkage.datasets import load_febrl4

# Load the datasets
dfA, dfB = load_febrl4()

# 1. Indexing - Create candidate links
indexer = recordlinkage.Index()
indexer.block('given_name')
candidate_links = indexer.index(dfA, dfB)

# 2. Comparison - Compare the records
compare_cl = recordlinkage.Compare()

compare_cl.exact('given_name', 'given_name', label='given_name')
compare_cl.string('surname', 'surname', method='jarowinkler', threshold=0.85, label='surname')
compare_cl.exact('date_of_birth', 'date_of_birth', label='date_of_birth')
compare_cl.exact('suburb', 'suburb', label='suburb')
compare_cl.exact('state', 'state', label='state')
compare_cl.string('address_1', 'address_1', method='levenshtein', threshold=0.85, label='address_1')

features = compare_cl.compute(candidate_links, dfA, dfB)

# 3. Classification - Find matches based on a threshold
# This example uses a simple threshold classifier
matches = features[features.sum(axis=1) > 3]

print(f"Number of potential matches: {len(matches)}")
print(matches.head())