Skip to content

Instantly share code, notes, and snippets.

@noveoko
Last active March 13, 2022 21:25
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save noveoko/4dd753b5da3da7b0f437a2c1ff796fe7 to your computer and use it in GitHub Desktop.
Save noveoko/4dd753b5da3da7b0f437a2c1ff796fe7 to your computer and use it in GitHub Desktop.
Create a dataset for predicting origin based on name
#pip install -U pip setuptools wheel
!pip install -U spacy
!python -m spacy download en_core_web_sm
import spacy
import csv
import re
import string
# Load the small English spaCy pipeline once at module level so that every
# later nlp(...) call reuses the same loaded model.
nlp = spacy.load("en_core_web_sm")

# Running tallies for the extraction run: CSV rows emitted so far, and input
# lines whose processing raised an exception.
rows_written = errors_found = 0
def clean_string(string: str) -> str:
    """Collapse each run of non-word characters and underscores into one space.

    Args:
        string: Text to normalise (for example an entity's surface text).

    Returns:
        The input with every maximal run of punctuation, whitespace,
        underscores or other non-word characters replaced by a single space.
        Leading and trailing runs also become a single space; nothing is
        stripped.
    """
    # Raw string literal: in a plain literal "\W" is an invalid escape
    # sequence, which newer Python versions warn about.
    return re.sub(r"[\W_]+", " ", string)
import spacy
import csv
import re
import string
# NOTE(review): the imports, model load and counters here repeat the setup
# that already appears earlier in this file, so the model is loaded a second
# time. Confirm whether the repeated section can be dropped.
nlp = spacy.load("en_core_web_sm")

# Tallies for the extraction run: rows written to the CSV and input lines
# that raised an exception while being processed.
rows_written = errors_found = 0
def clean_string(string: str) -> str:
    """Replace runs of non-word characters (and underscores) with one space.

    Args:
        string: Text to normalise.

    Returns:
        A copy of the text in which each maximal run of characters that are
        not letters or digits (underscore included) is replaced by exactly one
        space. The result is not stripped, so a leading or trailing run leaves
        a single space at that end.
    """
    # The pattern must be a raw string; "\W" inside a normal string literal is
    # an invalid escape sequence and produces a warning on recent Pythons.
    return re.sub(r"[\W_]+", " ", string)
def _birth_records(sent):
    """Build one record per PERSON entity found in the sentence *sent*.

    Entities are visited in order. Each PERSON starts a new record; every
    following GPE or DATE entity is attached to the most recent person, with a
    later entity overwriting an earlier one of the same kind. GPE and DATE
    entities that occur before any PERSON are ignored (indexing the empty list
    would otherwise raise ``IndexError``).

    Returns:
        A list of dicts with the keys "PERSON", "LOCATION" and "DATE"; fields
        that were never filled remain ``None``.
    """
    people = []
    for ent in sent.ents:
        if ent.label_ == "PERSON":
            people.append(
                {"PERSON": clean_string(ent.text), "LOCATION": None, "DATE": None}
            )
            continue
        # A place or date can only be attached once a person has been seen.
        if not people or not people[-1]["PERSON"]:
            continue
        if ent.label_ == "GPE":
            people[-1]["LOCATION"] = clean_string(ent.text)
        elif ent.label_ == "DATE":
            people[-1]["DATE"] = ent.text
    return people


# NOTE(review): `path` (the input text file) is not defined in this snippet;
# it has to be set before this block runs.
# newline="" is what the csv module expects for files it writes to; without it
# extra blank rows can appear on platforms that translate line endings.
with open("BIRTHS.csv", "w", encoding="utf-8", newline="") as outfile:
    writer = csv.DictWriter(outfile, fieldnames=["PERSON", "DATE", "LOCATION"])
    writer.writeheader()
    with open(path, "r", encoding="utf-8") as infile:
        # Iterating the file object streams it line by line and stops at end
        # of file (a `while True` around readlines() never terminates).
        for line in infile:
            # Cheap substring test first so the NLP pipeline only runs on
            # candidate lines.
            if "born in" not in line:
                continue
            try:
                doc = nlp(line)
                for sent in doc.sents:
                    if "born in" not in sent.text:
                        continue
                    for record in _birth_records(sent):
                        # Only fully populated records are written.
                        if all(record.values()):
                            writer.writerow(record)
                            rows_written += 1
                            print(f"Total rows written: {rows_written}", end="\r")
            except Exception:
                # Best effort: a line that cannot be processed is counted and
                # skipped so one bad line does not abort the whole run.
                errors_found += 1

# Report skipped lines instead of dropping the count silently.
if errors_found:
    print(f"\nLines skipped due to errors: {errors_found}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment