Skip to content

Instantly share code, notes, and snippets.

@OsmanMutlu
Created July 2, 2018 11:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save OsmanMutlu/f3b170800562d4b4cc012aeb2888552c to your computer and use it in GitHub Desktop.
Save OsmanMutlu/f3b170800562d4b4cc012aeb2888552c to your computer and use it in GitHub Desktop.
import pandas as pd
import re
import codecs
from glob import glob
from pynlpl.formats import folia
# Every adjudicated FoLiA document whose file name starts with "http".
files = glob("alladjudicated/http*")

# One row per document; "text" and "places" start as empty strings and are
# filled in per row later in the script.
all_df = pd.DataFrame(files, columns=["filename"]).assign(text="", places="")
def prep(row):
    """Fill in the document text and the place names for one row.

    Parses the FoLiA document at ``row.filename``, then walks every sentence,
    every entity layer in that sentence and every entity in that layer,
    keeping the entities whose class is ``"place"``.

    Args:
        row: A row that carries ``filename``, ``text`` and ``places``
            entries (the script passes rows via ``DataFrame.apply``).

    Returns:
        The same ``row``, with ``text`` set to the document's full text and
        ``places`` set to a list holding one space-joined string per place
        entity, in document order.
    """
    # Generic words that are left out of a place name. The original test
    # (`!= "village" or != "district"`) was always true and filtered nothing.
    ignored_words = {"village", "district"}

    doc = folia.Document(file=row.filename)
    # NOTE(review): attribute assignment only updates the row when the label
    # already exists on it; the script creates both columns beforehand.
    row.text = doc.text()

    places = []
    for sentence in doc.sentences():
        for layer in sentence.select(folia.EntitiesLayer):
            for entity in layer.select(folia.Entity):
                if entity.cls != "place":
                    continue
                # Read each word's text once, then drop the ignored words.
                tokens = (word.text() for word in entity.wrefs())
                places.append(
                    " ".join(t for t in tokens if t not in ignored_words)
                )

    row.places = places
    return row
# Run prep once per row (axis="columns" is the named form of axis=1), so each
# call receives one row and the returned rows are assembled into the new frame.
all_df = all_df.apply(prep, axis="columns")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment