Skip to content

Instantly share code, notes, and snippets.

@nathairtras
Created May 18, 2019 03:17
Show Gist options
  • Save nathairtras/ee25476929ce6cadd77253fbbd1cb1df to your computer and use it in GitHub Desktop.
Save nathairtras/ee25476929ce6cadd77253fbbd1cb1df to your computer and use it in GitHub Desktop.
Extracting words and known phrases without NLTK
import re
from collections import Counter

# Count how often each word and each known multi-word phrase occurs in a text,
# using only the standard library.

# List of phrases
phrases = ["computer science", "lots of fun"]

# Text to parse
paper_text = """
This is a sentence that includes the phrase computer science. Computer science is fun.
Writing code is lots of fun.
"""


def normalize_text(text):
    """Return *text* with punctuation removed, whitespace collapsed, lowercased.

    Args:
        text: Raw input string.

    Returns:
        The cleaned string: words separated by single spaces, no leading or
        trailing whitespace, all lowercase.
    """
    # Remove punctuation (anything that is neither a word character nor whitespace)
    text = re.sub(r"[^\w\s]", "", text)
    # Collapse runs of whitespace (including newlines) into a single space
    text = re.sub(r"\s+", " ", text)
    # Strip leading and trailing whitespace, set to all lowercase
    return text.lower().strip()


def split_words_and_phrases(text, known_phrases):
    """Split already-normalized *text* into words and known phrases.

    A known phrase is returned as a single item (with its spaces intact) only
    when it appears as a run of whole words; every other word is returned on
    its own.

    Args:
        text: Text previously cleaned with ``normalize_text``.
        known_phrases: Iterable of phrases to keep together. Each phrase is
            normalized the same way as the text before matching.

    Returns:
        A list of words and phrases in order of appearance; empty for empty
        text.
    """
    cleaned_phrases = {normalize_text(phrase) for phrase in known_phrases}
    # A phrase that normalizes to nothing would produce an empty-match pattern
    cleaned_phrases.discard("")
    # Longest first, so a phrase always wins over a shorter phrase it starts with
    ordered = sorted(cleaned_phrases, key=lambda phrase: (-len(phrase), phrase))
    # The lookarounds require whitespace (or the text edge) on both sides, so a
    # phrase never matches inside a longer word such as "supercomputer sciences"
    alternatives = [r"(?<!\S)" + re.escape(phrase) + r"(?!\S)" for phrase in ordered]
    # Fallback: any single whitespace-delimited word
    alternatives.append(r"\S+")
    return re.findall("|".join(alternatives), text)


## String cleanup
paper_text = normalize_text(paper_text)

## Word/phrase extraction
# Matching phrases directly, rather than temporarily joining them with "_",
# leaves words that genuinely contain an underscore untouched.
split_text = split_words_and_phrases(paper_text, phrases)

# Get a count of words and phrases (a plain dict, in order of first appearance)
counts = dict(Counter(split_text))

# Output
print(counts)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment