Skip to content

Instantly share code, notes, and snippets.

@bowbowbow
Created December 3, 2018 16:51
Show Gist options
  • Save bowbowbow/45124dc14e886f89fa566c61a74de4cf to your computer and use it in GitHub Desktop.
Save bowbowbow/45124dc14e886f89fa566c61a74de4cf to your computer and use it in GitHub Desktop.
import utils
import nltk, datetime
from pprint import pprint
import spacy
# Load the spaCy pipeline 'en_core_web_lg' once, at module import time.
# find_candidates_by_keyword() calls this object on each candidate text and
# reads the resulting `doc.ents`.
nlp = spacy.load('en_core_web_lg')
def count_by_keyword(df, keyword):
    """Count the rows of *df* whose title or body contains *keyword*.

    Args:
        df: Object exposing ``iterrows()`` whose rows provide ``'title'`` and
            ``' body'`` entries (the body key has a leading space).
        keyword: Substring to search for; matching is a case-sensitive
            ``in`` test.

    Returns:
        The number of matching rows. The same number is also printed.
    """
    count = 0
    for index, row in df.iterrows():
        print('index :', index)  # per-row progress trace
        fields = (row['title'], row[' body'])
        # Non-string cells (e.g. NaN for missing text) cannot contain the
        # keyword and would raise TypeError on `in`, so they are skipped.
        if any(isinstance(text, str) and keyword in text for text in fields):
            count += 1
    print('[count_by_keyword] keyword:{}, count:{}'.format(keyword, count))
    return count
def find_candidates_by_keyword(df, keyword):
    """Collect titles and body sentences containing *keyword* and print them.

    Each candidate is a tuple ``(text, time, position)`` where ``position``
    is ``-1`` for a title and the sentence index within the body otherwise.
    Candidates are sorted by ``(time, position)``; each one is printed,
    followed by the ``(text, label_)`` pairs of the entities that the
    module-level ``nlp`` pipeline finds in its text.

    Args:
        df: Object exposing ``iterrows()`` whose rows provide ``'title'``,
            ``' time'`` (formatted ``'%Y-%m-%d %H:%M:%S'``) and ``' body'``
            entries (the latter two keys have a leading space).
        keyword: Substring to search for; matching is a case-sensitive
            ``in`` test.

    Returns:
        The sorted list of candidate tuples.

    Raises:
        ValueError: If a ``' time'`` value does not match the expected format.
    """
    candidates = []
    for _, row in df.iterrows():
        title = row['title']
        timestamp = datetime.datetime.strptime(row[' time'], '%Y-%m-%d %H:%M:%S')
        # Non-string cells (e.g. NaN for missing text) are skipped instead of
        # raising TypeError on the `in` test.
        if isinstance(title, str) and keyword in title:
            candidates.append((title, timestamp, -1))

        body = row[' body']
        sentences = nltk.sent_tokenize(body) if isinstance(body, str) else []
        candidates.extend(
            (sentence, timestamp, position)
            for position, sentence in enumerate(sentences)
            if keyword in sentence
        )

    print('candidates len: {} by keyword({})'.format(len(candidates), keyword))

    # Chronological order; within one timestamp the title (-1) sorts before
    # the body sentences, which keep their original order.
    candidates.sort(key=lambda candidate: (candidate[1], candidate[2]))
    for candidate in candidates:
        print(candidate)
        doc = nlp(candidate[0])
        print([(ent.text, ent.label_) for ent in doc.ents])
    return candidates
if __name__ == "__main__":
    # Script entry point: load the data through the project's `utils` helper
    # and print every title/sentence that mentions 'Warmbier'.
    df = utils.data_load()
    find_candidates_by_keyword(df, 'Warmbier')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment