Last active
August 29, 2015 14:10
-
-
Save pkpp1233/5ac93730c60e029c24b4 to your computer and use it in GitHub Desktop.
fuzzy matcher block
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import blockspring | |
from fuzzywuzzy import fuzz | |
from fuzzywuzzy import process | |
import os | |
import pandas as pd | |
import numpy as np | |
def block(request, response):
    """Fuzzy-match each reference id against candidate ids and emit a CSV.

    Reads three entries from ``request.params``:

    * ``"proper_ids"`` -- rows whose first cell is a reference id. The first
      row is treated as a header and skipped.
    * ``"fuzzy_ids"`` -- rows whose first cell is a candidate id. The first
      row is likewise skipped.
    * ``"count_matches"`` -- how many matches to report per reference id.
      A falsy value (``None``, ``""``, ``0``) falls back to 2.

    For every reference id, ``process.extract`` is asked for up to
    ``count_matches`` candidates. Each returned match tuple is flattened
    into a single ``", "``-joined cell. The rows are written to
    ``output.csv`` with the header ``ids, Match 1, Match 2, ...`` and the
    file is attached to ``response`` under the name ``"fuzzy_matched"``.
    """
    # Drop the header row with a slice instead of pop(0) so that an empty
    # input produces an empty list rather than an IndexError.
    primary_ids = [str(row[0]) for row in request.params["proper_ids"]][1:]
    for_match_ids = [str(row[0]) for row in request.params["fuzzy_ids"]][1:]

    fuzzy_matches = int(request.params["count_matches"] or 2)
    headers = ["ids"] + ["Match " + str(i + 1) for i in range(fuzzy_matches)]

    rows = []
    for primary_id in primary_ids:
        # With no candidates there is nothing to score; skip the lookup.
        if for_match_ids:
            matches = process.extract(
                primary_id, for_match_ids, limit=fuzzy_matches
            )
        else:
            matches = []
        cells = [", ".join(str(part) for part in match) for match in matches]
        # There may be fewer candidates than requested matches. Pad with
        # empty cells so every row has exactly len(headers) columns;
        # otherwise the DataFrame constructor rejects the column list.
        cells += [""] * (fuzzy_matches - len(cells))
        rows.append([primary_id] + cells)

    df = pd.DataFrame(rows, columns=headers)
    df.to_csv("output.csv")
    response.addFileOutput("fuzzy_matched", "output.csv")
    response.end()
# Hand the `block` handler to blockspring.define (module-level call).
blockspring.define(block)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment