Skip to content

Commit

Permalink
Fix emlink logic for term-frequency (tf) variables
Browse files (browse the repository at this point in the history)
  • Loading branch information
jw2249a committed May 13, 2024
1 parent c5af768 commit 8201242
Show file tree
Hide file tree
Showing 5 changed files with 19 additions and 19 deletions.
2 changes: 1 addition & 1 deletion src/fastlink/fastlink.jl
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ function fastLink(dfA::DataFrame, dfB::DataFrame, config::Dict{String,Any})
end

results = process_comparisons(res, emlink_configuration, _dims, parameters, tf_tables)

if length(results) == 3
return Dict("ids" => indices_to_uids(dfA[!, config["idvar"][1]],dfB[!, config["idvar"][2]],results[1].indices),
"resultsEM" => results[2],
Expand Down
4 changes: 2 additions & 2 deletions src/gammas/gammaCKpar.jl
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,7 @@ function gammaCKpar!(vecA::PooledVector,vecB::PooledVector,

# term frequency adjustment for x
tf_val_x = length(missingindices)/_dims[1]
for tf_i in missingindices
Threads.@threads for tf_i in missingindices
tf_table_x[tf_i] = max(tf_val_y, tf_minimum_u_value)
end

Expand All @@ -200,7 +200,7 @@ function gammaCKpar!(vecA::PooledVector,vecB::PooledVector,
missingindices = findall(vecB.refs .== missingvals_y)
# term frequency adjustment for y
tf_val_y = length(missingindices)/_dims[2]
for tf_i in missingindices
Threads.@threads for tf_i in missingindices
tf_table_y[tf_i] = max(tf_val_y, tf_minimum_u_value)
end

Expand Down
18 changes: 7 additions & 11 deletions src/patterns.jl
Original file line number Diff line number Diff line change
Expand Up @@ -90,16 +90,14 @@ end

function get_match_patterns(res::Vector{DiBitMatrix}, tf_tables::Dict{String, Vector{Vector{Float16}}},
tf_vars::Vector{String}, tf_indices::Vector{Int64}, isexact=Bool[])

tf_patterns = Dict("relevant_tf_indices"=>Vector{Int64}[],
"tf_denom_vals"=>Vector{Vector{Float16}}[])

"tf_denom_vals"=>Vector{Vector{Float16}}[])
matches=MatchPatterns()
N = length(res)
dimy=res[1].nrows
len=Int(res[1].data.len)
lk = ReentrantLock()
Threads.@threads for first_loc in 0:1024:len
for first_loc in 0:1024:len
last_loc = first_loc + 1024
if last_loc > len
last_loc=len
Expand All @@ -108,22 +106,20 @@ function get_match_patterns(res::Vector{DiBitMatrix}, tf_tables::Dict{String, Ve
patterns=get_local_patterns(x,N,last_loc-first_loc)
for i in eachindex(patterns.hashes)
lock(lk) do

id = findfirst(patterns.hashes[i] .=== matches.hashes)
pattern_indices = get_2Dindex.(first_loc .+ patterns.indices[i],dimy)
if isnothing(id)
push!(matches.patterns,patterns.patterns[i])
push!(matches.hashes,patterns.hashes[i])
push!(matches.indices, pattern_indices)

relevant_tf_indices = find_tf_pattern_vars(patterns.patterns[i], tf_indices)
tfi_ids = [findfirst(tfi .== tf_indices) for tfi in relevant_tf_indices]
push!(tf_patterns["relevant_tf_indices"], relevant_tf_indices)
push!(tf_patterns["tf_denom_vals"], [match_level_tf_lookup(tf_tables[tf_vars[tfi]], pattern_indices, isexact[tfi])
for tfi in relevant_tf_indices])
push!(tf_patterns["tf_denom_vals"], [match_level_tf_lookup(tf_tables[tf_vars[tfi]], pattern_indices, isexact[tfi]) for tfi in tfi_ids])
else
tfi_ids = [findfirst(tfi .== tf_indices) for tfi in tf_patterns["relevant_tf_indices"][id]]
append!(matches.indices[id], pattern_indices)

for (tfi_loc, tfi) in enumerate(tf_patterns["relevant_tf_indices"][id])
for (tfi_loc, tfi) in enumerate(tfi_ids)
append!(tf_patterns["tf_denom_vals"][id][tfi_loc], match_level_tf_lookup(tf_tables[tf_vars[tfi]], pattern_indices, isexact[tfi]))
end
end
Expand Down Expand Up @@ -224,7 +220,7 @@ function match_and_link(patterns::Vector{DiBitMatrix}, e::Dict{String, Any}, _di
e["parameters"]...)

tf_prior_weights = get_tf_adjustment_prior_weights(parameters, tf_vars)
resultsTF = generate_tf_adjustment_dict(resultsEM, tf_patterns, tf_prior_weights; base="log")
resultsTF = generate_tf_adjustment_dict(resultsEM, e, tf_vars, tf_patterns, tf_prior_weights; base="log")

if e["name"] != final_name
return patterns_to_DiBit(resultsTF, counts.indices, _dims)
Expand Down
9 changes: 6 additions & 3 deletions src/term_frequency_adjustment.jl
Original file line number Diff line number Diff line change
Expand Up @@ -72,14 +72,17 @@ function pattern_tf_adjustment!(tf_result::Dict{String, Any}, colnames::Vector{S
return nothing
end

function generate_tf_adjustment_dict(EMOutput::Dict{String,Any}, tfPatterns::Dict{String,Vector}, tf_prior_weights::Vector{Float64}; base="log2")
function generate_tf_adjustment_dict(EMOutput::Dict{String,Any},e::Dict{String, Any}, tf_vars::Vector{String}, tfPatterns::Dict{String,Vector}, tf_prior_weights::Vector{Float64}; base="log2")
tfResults = generate_tf_skeleton(EMOutput, tfPatterns["relevant_tf_indices"])
threshold_match = EMOutput["threshold_match"]
for pattern_id in collect(1:EMOutput["number_of_unique_patterns"])
colindices = tfPatterns["relevant_tf_indices"][pattern_id]
colindices = tfPatterns["relevant_tf_indices"][pattern_id]

count = EMOutput["patterns_w"].counts[pattern_id]
tf_uvals = get_tf_u_values(EMOutput["patterns_w"], colindices, pattern_id)
tf_pw = tf_prior_weights[colindices]

ci = [findfirst(v .== tf_vars) for v in e["variables"][colindices]]
tf_pw = tf_prior_weights[ci]

if tfResults[pattern_id]["tf_adjusted"]

Expand Down
5 changes: 3 additions & 2 deletions test_parameters.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,14 @@
"variables": [
{"varname": "firstname", "method": "fuzzy", "partial": true, "cut_a": 0.92, "cut_b": 0.88, "upper": true, "tf_adjust": true, "w": 0.1},
{"varname": "middlename", "method": "exact"},
{"varname": "lastname", "method": "jarowinkler"},
{"varname": "lastname", "method": "jarowinkler", "tf_adjust": true},
{"varname": "birthyear", "method": "exact"},
{
"comparisons": {
"name": "address",
"threshold_match": 0.92,
"variables": [
{"varname": "housenum", "method": "exact", "tf_adjust": true, "tf_adjustment_weight":0.5, "tf_minimum_u_value": 0.001},
{"varname": "housenum", "method": "exact", "tf_adjust": true},
{"varname": "streetname", "method": "jarowinkler", "w": 0.1, "tf_adjust": true, "tf_adjustment_weight":0.25, "tf_minimum_u_value": 0.001},
{"varname": "city", "method": "jarowinkler", "tf_adjustment_weight":0.15, "tf_adjust": true}
]
Expand Down

0 comments on commit 8201242

Please sign in to comment.