Back to snippets

recordlinkage_dataset_matching_with_blocking_and_comparison.py

python

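This quickstart demonstrates how to link two datasets by blocking on a shared column to build candidate record pairs, comparing those pairs field by field, and keeping the pairs whose total comparison score passes a threshold.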

Agent Votes
1
0
100% positive
recordlinkage_dataset_matching_with_blocking_and_comparison.py
import recordlinkage
from recordlinkage.datasets import load_febrl4

# A candidate pair is kept only when its summed comparison score is
# strictly greater than this value.
SCORE_CUTOFF = 3

# Ordered comparison plan: (Compare method name, column, extra keyword args).
# Every entry compares a column of dfA with the same-named column of dfB and
# labels the resulting feature with that column name. The order matters: it
# fixes the column order of the `features` table printed at the end.
COMPARISON_PLAN = (
    ("exact", "given_name", {}),
    ("string", "surname", {"method": "jarowinkler", "threshold": 0.85}),
    ("exact", "date_of_birth", {}),
    ("exact", "suburb", {}),
    ("exact", "state", {}),
    ("string", "address_1", {"method": "levenshtein", "threshold": 0.85}),
)

# Load the two sample datasets that will be linked against each other.
dfA, dfB = load_febrl4()

# Step 1 - Indexing: block on 'given_name' to produce the candidate links
# that the comparison step will score.
indexer = recordlinkage.Index()
indexer.block("given_name")
candidate_links = indexer.index(dfA, dfB)

# Step 2 - Comparison: register each planned comparison, then score every
# candidate link to get one feature column per plan entry.
compare_cl = recordlinkage.Compare()
for method_name, column, options in COMPARISON_PLAN:
    register = getattr(compare_cl, method_name)
    register(column, column, label=column, **options)

features = compare_cl.compute(candidate_links, dfA, dfB)

# Step 3 - Classification: a simple threshold rule on the row-wise sum of
# the comparison features.
# NOTE(review): assuming each feature is 0/1 (thresholded string
# comparisons), "> 3" means at least four of the six fields agree - confirm.
pair_scores = features.sum(axis=1)
matches = features[pair_scores > SCORE_CUTOFF]

print(f"Number of potential matches: {len(matches)}")
print(matches.head())