# nathairtras/non_ntlk_phrase_counts.py

## non_ntlk_phrase_counts.py
import re
from collections import Counter

# List of phrases
phrases = ["computer science", "lots of fun"]

# Text to parse
paper_text = """
This is a sentence that includes the phrase computer science.  Computer science is fun.
Writing code is lots of fun.
"""


def count_phrases(text, phrases):
    """Count how often each word or known phrase occurs in *text*.

    The text is stripped of punctuation, whitespace-normalised and lowercased
    before matching, so each phrase should be given in lowercase, without
    punctuation and with single spaces between its words.

    Args:
        text: Raw text to analyse.
        phrases: Multi-word phrases that must be counted as a single unit.

    Returns:
        A dict mapping each word or phrase to its number of occurrences,
        in order of first appearance.
    """
    ## String cleanup
    # Remove punctuation
    cleaned = re.sub(r'[^\w\s]', '', text)
    # Collapse whitespace runs, lowercase, strip leading and trailing whitespace
    cleaned = re.sub(r'\s+', ' ', cleaned).lower().strip()

    ## Phrase identification
    # Match phrases in place instead of temporarily joining their words with
    # "_": that encoding turned every genuine underscore in the text into a
    # space and also fired on phrases embedded in longer words.
    # Longest phrase first, so it wins over a shorter phrase that is its prefix.
    escaped = [re.escape(p) for p in sorted(phrases, key=len, reverse=True) if p]
    token_pattern = r'\S+'
    if escaped:
        # A phrase only counts when delimited by whitespace or the text edges.
        token_pattern = r'(?<!\S)(?:' + '|'.join(escaped) + r')(?!\S)|' + token_pattern

    ## Word/phrase extraction
    # findall on empty text yields no tokens, so no bogus '' entry is counted.
    return dict(Counter(re.findall(token_pattern, cleaned)))


# Get a count of phrases
counts = count_phrases(paper_text, phrases)

# Output
print(counts)
	import re
	from collections import Counter

	# List of phrases
	phrases = ["computer science", "lots of fun"]

	# Text to parse
	paper_text = """
	This is a sentence that includes the phrase computer science. Computer science is fun.
	Writing code is lots of fun.
	"""


	def count_phrases(text, phrases):
	    """Count how often each word or known phrase occurs in *text*.

	    The text is stripped of punctuation, whitespace-normalised and lowercased
	    before matching, so each phrase should be given in lowercase, without
	    punctuation and with single spaces between its words.

	    Args:
	        text: Raw text to analyse.
	        phrases: Multi-word phrases that must be counted as a single unit.

	    Returns:
	        A dict mapping each word or phrase to its number of occurrences,
	        in order of first appearance.
	    """
	    ## String cleanup
	    # Remove punctuation
	    cleaned = re.sub(r'[^\w\s]', '', text)
	    # Collapse whitespace runs, lowercase, strip leading and trailing whitespace
	    cleaned = re.sub(r'\s+', ' ', cleaned).lower().strip()

	    ## Phrase identification
	    # Match phrases in place instead of temporarily joining their words with
	    # "_": that encoding turned every genuine underscore in the text into a
	    # space and also fired on phrases embedded in longer words.
	    # Longest phrase first, so it wins over a shorter phrase that is its prefix.
	    escaped = [re.escape(p) for p in sorted(phrases, key=len, reverse=True) if p]
	    token_pattern = r'\S+'
	    if escaped:
	        # A phrase only counts when delimited by whitespace or the text edges.
	        token_pattern = r'(?<!\S)(?:' + '|'.join(escaped) + r')(?!\S)|' + token_pattern

	    ## Word/phrase extraction
	    # findall on empty text yields no tokens, so no bogus '' entry is counted.
	    return dict(Counter(re.findall(token_pattern, cleaned)))


	# Get a count of phrases
	counts = count_phrases(paper_text, phrases)

	# Output
	print(counts)