@mattstein
Last active December 29, 2023 20:04
Use TextBlob to extract the most-used adjectives from a bunch of Markdown files.
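Before the script will run, TextBlob and NLTK need a few corpora on disk. A minimal one-time setup sketch, assuming a fresh environment where nothing has been downloaded yet (adjust for whatever you already have installed):

# one-time setup: pip install textblob nltk, then fetch the corpora the script relies on
import nltk
nltk.download('stopwords')                   # stop word list used in clean_text()
nltk.download('punkt')                       # tokenizer TextBlob uses to split words
nltk.download('averaged_perceptron_tagger')  # POS tagger behind blob.tags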
import os
import re
import glob
from textblob import TextBlob
from collections import Counter
from nltk.corpus import stopwords
# recursively find all the Markdown in a directory and smush it into a big string
def load_markdown_files(directory):
    md_files = glob.glob(os.path.join(directory, '**/*.md'), recursive=True)
    content = ''
    for file in md_files:
        with open(file, 'r') as f:
            content += f.read()
    return content

# !! update this with your own path
text = load_markdown_files('/path/to/directory/with/markdown')

# clean up that big string to scrape out stuff we don’t want
def clean_text(text):
    # lowercase everything
    text = text.lower()
    # remove URLs, @ mentions, and leftover punctuation
    text = re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", text)
    # remove stop words
    stop = stopwords.words('english')
    text = " ".join([word for word in text.split() if word not in stop])
    return text

# get the adjectives we’re looking for
def get_top_adjectives(text):
    blob = TextBlob(clean_text(text))
    # collect adjectives based on this coding: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
    adjectives = [word for (word, tag) in blob.tags if tag in ('JJ', 'JJR', 'JJS')]
    adjective_counts = Counter(adjectives)
    top_adjectives = adjective_counts.most_common(30)
    return top_adjectives

top_adjectives = get_top_adjectives(text)
for (word, count) in top_adjectives:
    print(f"{word}: {count}")
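
With the directory path updated, running the script prints the 30 most common adjectives in descending order, one per line in the format word: count (e.g. a line might read personal: 42, though the words and numbers depend entirely on your Markdown). Change the argument to most_common(30) to widen or narrow the report.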